/* * Copyright (c) 2016 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include /* SSE4.1 */ #include "./vp10_rtcd.h" #include "./vpx_config.h" #include "vp10/common/vp10_fwd_txfm2d_cfg.h" #include "vp10/common/vp10_txfm.h" #include "vpx_dsp/txfm_common.h" #include "vpx_ports/mem.h" static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, int stride, int flipud, int fliplr, int shift) { if (!flipud) { in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); } else { in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); } if (fliplr) { in[0] = _mm_shufflelo_epi16(in[0], 0x1b); in[1] = _mm_shufflelo_epi16(in[1], 0x1b); in[2] = _mm_shufflelo_epi16(in[2], 0x1b); in[3] = _mm_shufflelo_epi16(in[3], 0x1b); } in[0] = _mm_cvtepi16_epi32(in[0]); in[1] = _mm_cvtepi16_epi32(in[1]); in[2] = _mm_cvtepi16_epi32(in[2]); in[3] = _mm_cvtepi16_epi32(in[3]); in[0] = _mm_slli_epi32(in[0], shift); in[1] = _mm_slli_epi32(in[1], shift); in[2] = _mm_slli_epi32(in[2], shift); in[3] = _mm_slli_epi32(in[3], shift); } // We only use stage-2 bit; // shift[0] is used in load_buffer_4x4() // shift[1] is used in txfm_func_col() // shift[2] is used in txfm_func_row() static void fdct4x4_sse4_1(__m128i *in, int bit) { const int32_t *cospi = cospi_arr[bit - 
cos_bit_min]; const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i s0, s1, s2, s3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; s0 = _mm_add_epi32(in[0], in[3]); s1 = _mm_add_epi32(in[1], in[2]); s2 = _mm_sub_epi32(in[1], in[2]); s3 = _mm_sub_epi32(in[0], in[3]); // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); u0 = _mm_mullo_epi32(s0, cospi32); u1 = _mm_mullo_epi32(s1, cospi32); u2 = _mm_add_epi32(u0, u1); v0 = _mm_sub_epi32(u0, u1); u3 = _mm_add_epi32(u2, rnding); v1 = _mm_add_epi32(v0, rnding); u0 = _mm_srai_epi32(u3, bit); u2 = _mm_srai_epi32(v1, bit); // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); v0 = _mm_mullo_epi32(s2, cospi48); v1 = _mm_mullo_epi32(s3, cospi16); v2 = _mm_add_epi32(v0, v1); v3 = _mm_add_epi32(v2, rnding); u1 = _mm_srai_epi32(v3, bit); v0 = _mm_mullo_epi32(s2, cospi16); v1 = _mm_mullo_epi32(s3, cospi48); v2 = _mm_sub_epi32(v1, v0); v3 = _mm_add_epi32(v2, rnding); u3 = _mm_srai_epi32(v3, bit); // Note: shift[1] and shift[2] are zeros // Transpose 4x4 32-bit v0 = _mm_unpacklo_epi32(u0, u1); v1 = _mm_unpackhi_epi32(u0, u1); v2 = _mm_unpacklo_epi32(u2, u3); v3 = _mm_unpackhi_epi32(u2, u3); in[0] = _mm_unpacklo_epi64(v0, v2); in[1] = _mm_unpackhi_epi64(v0, v2); in[2] = _mm_unpacklo_epi64(v1, v3); in[3] = _mm_unpackhi_epi64(v1, v3); } static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); } // Note: // We implement vp10_fwd_txfm2d_4x4(). 
// This function is kept here since
// vp10_highbd_fht4x4_c() is not removed yet
// Unimplemented stub: callers should use vp10_fwd_txfm2d_4x4_sse4_1()
// instead. Reaching this function is a programming error (assert fires).
void vp10_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
                               int stride, int tx_type) {
  (void)input;
  (void)output;
  (void)stride;
  (void)tx_type;
  assert(0);
}

// One 4-point forward ADST pass over the four columns held in in[0..3],
// with the result transposed in place (same in-place/transpose convention
// as fdct4x4_sse4_1). 'bit' selects the cosine table and rounding shift.
static void fadst4x4_sse4_1(__m128i *in, int bit) {
  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // round-to-nearest
  const __m128i kZero = _mm_setzero_si128();
  __m128i s0, s1, s2, s3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  // stage 0
  // stage 1
  // stage 2
  // s0 = in[3]*cospi8 + in[0]*cospi56, rounded and shifted down by 'bit'.
  u0 = _mm_mullo_epi32(in[3], cospi8);
  u1 = _mm_mullo_epi32(in[0], cospi56);
  u2 = _mm_add_epi32(u0, u1);
  s0 = _mm_add_epi32(u2, rnding);
  s0 = _mm_srai_epi32(s0, bit);

  // s1 = in[3]*cospi56 - in[0]*cospi8
  v0 = _mm_mullo_epi32(in[3], cospi56);
  v1 = _mm_mullo_epi32(in[0], cospi8);
  v2 = _mm_sub_epi32(v0, v1);
  s1 = _mm_add_epi32(v2, rnding);
  s1 = _mm_srai_epi32(s1, bit);

  // s2 = in[1]*cospi40 + in[2]*cospi24
  u0 = _mm_mullo_epi32(in[1], cospi40);
  u1 = _mm_mullo_epi32(in[2], cospi24);
  u2 = _mm_add_epi32(u0, u1);
  s2 = _mm_add_epi32(u2, rnding);
  s2 = _mm_srai_epi32(s2, bit);

  // s3 = in[1]*cospi24 - in[2]*cospi40
  v0 = _mm_mullo_epi32(in[1], cospi24);
  v1 = _mm_mullo_epi32(in[2], cospi40);
  v2 = _mm_sub_epi32(v0, v1);
  s3 = _mm_add_epi32(v2, rnding);
  s3 = _mm_srai_epi32(s3, bit);

  // stage 3
  u0 = _mm_add_epi32(s0, s2);
  u2 = _mm_sub_epi32(s0, s2);
  u1 = _mm_add_epi32(s1, s3);
  u3 = _mm_sub_epi32(s1, s3);

  // stage 4
  // (u2, u3) = rotate(u2, u3) by cospi32, with rounding.
  v0 = _mm_mullo_epi32(u2, cospi32);
  v1 = _mm_mullo_epi32(u3, cospi32);
  v2 = _mm_add_epi32(v0, v1);
  s2 = _mm_add_epi32(v2, rnding);
  u2 = _mm_srai_epi32(s2, bit);

  v2 = _mm_sub_epi32(v0, v1);
  s3 = _mm_add_epi32(v2, rnding);
  u3 = _mm_srai_epi32(s3, bit);

  // u0, u1, u2, u3
  // Negate u2 and u1 to produce the final output sign pattern.
  u2 = _mm_sub_epi32(kZero, u2);
  u1 = _mm_sub_epi32(kZero, u1);

  // u0, u2, u3, u1
  // Transpose 4x4 32-bit (note the permuted row order u0,u2,u3,u1 above).
  v0 = _mm_unpacklo_epi32(u0, u2);
  v1 = _mm_unpackhi_epi32(u0, u2);
  v2 = _mm_unpacklo_epi32(u3, u1);
  v3 = _mm_unpackhi_epi32(u3, u1);

  in[0] = _mm_unpacklo_epi64(v0, v2);
  in[1] = _mm_unpackhi_epi64(v0, v2);
  in[2] = _mm_unpacklo_epi64(v1, v3);
  in[3] = _mm_unpackhi_epi64(v1, v3);
}

// 2-D forward 4x4 transform: load + pre-shift, column pass, row pass
// (each pass transposes in place, so after two passes the orientation is
// restored), then store. The transform pair is chosen by tx_type; the
// per-pass cosine bit-depths come from the corresponding config table.
// 'bd' (bit depth) is unused here.
void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, tran_low_t *coeff,
                                int input_stride, int tx_type, const int bd) {
  __m128i in[4];
  const TXFM_2D_CFG *cfg = NULL;

  switch (tx_type) {
    case DCT_DCT:
      cfg = &fwd_txfm_2d_cfg_dct_dct_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);  // column transform
      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);  // row transform
      write_buffer_4x4(in, coeff);
      break;
    case ADST_DCT:
      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_ADST:
      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_ADST:
      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
      write_buffer_4x4(in, coeff);
      break;
    default: assert(0);
  }
  (void)bd;
}