Commit e8e8cd8f authored by Yi Luo's avatar Yi Luo
Browse files

Hybrid forward transforms 16x16 AVX2 optimization

- Unit tests are added for AVX2 SIMD.
- Encoder speed improvement:
  AV1 baseline and EXT_TX, three 1080p sequences at bitrate:
  800 Kbps, 2 Mbps, 6 Mbps, on i7-6700 CPU, average
  user level time reduction: 3.86%.

Change-Id: Ibbd7837ee3a831c6b1e4e471bf6c8d3fa3a19ff4
parent 29804479
......@@ -195,6 +195,7 @@ ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
......
......@@ -700,7 +700,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_fdct16x16 sse2 msa/;
add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_fdct16x16_1 sse2 msa/;
specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_fdct32x32 sse2 avx2 msa/;
......
......@@ -12,16 +12,7 @@
#include <immintrin.h> // AVX2
#include "aom_dsp/txfm_common.h"
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define pair256_set_epi32(a, b) \
_mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
(int)(b), (int)(a))
#include "aom_dsp/x86/txfm_common_avx2.h"
#if FDCT32x32_HIGH_PRECISION
static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
#include <immintrin.h>
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define pair256_set_epi32(a, b) \
_mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
(int)(b), (int)(a))
#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
......@@ -116,6 +116,7 @@ endif
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
AV1_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
......
......@@ -378,7 +378,7 @@ add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int st
specialize qw/av1_fht8x8 sse2/;
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x16 sse2/;
specialize qw/av1_fht16x16 sse2 avx2/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32/;
......
......@@ -1709,53 +1709,50 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
if (tx_type == DCT_DCT) {
aom_fdct16x16_c(input, output, stride);
} else {
static const transform_2d FHT[] = {
{ fdct16, fdct16 }, // DCT_DCT
{ fadst16, fdct16 }, // ADST_DCT
{ fdct16, fadst16 }, // DCT_ADST
{ fadst16, fadst16 }, // ADST_ADST
static const transform_2d FHT[] = {
{ fdct16, fdct16 }, // DCT_DCT
{ fadst16, fdct16 }, // ADST_DCT
{ fdct16, fadst16 }, // DCT_ADST
{ fadst16, fadst16 }, // ADST_ADST
#if CONFIG_EXT_TX
{ fadst16, fdct16 }, // FLIPADST_DCT
{ fdct16, fadst16 }, // DCT_FLIPADST
{ fadst16, fadst16 }, // FLIPADST_FLIPADST
{ fadst16, fadst16 }, // ADST_FLIPADST
{ fadst16, fadst16 }, // FLIPADST_ADST
{ fidtx16, fidtx16 }, // IDTX
{ fdct16, fidtx16 }, // V_DCT
{ fidtx16, fdct16 }, // H_DCT
{ fadst16, fidtx16 }, // V_ADST
{ fidtx16, fadst16 }, // H_ADST
{ fadst16, fidtx16 }, // V_FLIPADST
{ fidtx16, fadst16 }, // H_FLIPADST
#endif // CONFIG_EXT_TX
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[256];
int i, j;
tran_low_t temp_in[16], temp_out[16];
{ fadst16, fdct16 }, // FLIPADST_DCT
{ fdct16, fadst16 }, // DCT_FLIPADST
{ fadst16, fadst16 }, // FLIPADST_FLIPADST
{ fadst16, fadst16 }, // ADST_FLIPADST
{ fadst16, fadst16 }, // FLIPADST_ADST
{ fidtx16, fidtx16 }, // IDTX
{ fdct16, fidtx16 }, // V_DCT
{ fidtx16, fdct16 }, // H_DCT
{ fadst16, fidtx16 }, // V_ADST
{ fidtx16, fadst16 }, // H_ADST
{ fadst16, fidtx16 }, // V_FLIPADST
{ fidtx16, fadst16 }, // H_FLIPADST
#endif // CONFIG_EXT_TX
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[256];
int i, j;
tran_low_t temp_in[16], temp_out[16];
#if CONFIG_EXT_TX
int16_t flipped_input[16 * 16];
maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
int16_t flipped_input[16 * 16];
maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
#endif
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
}
// Rows
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
}
}
......
......@@ -9049,7 +9049,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
uint8_t drl1_ctx = 0;
drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
i + idx_offset);
tmp_rate += (tmp_rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] : 0);
tmp_rate +=
(tmp_rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] : 0);
}
if (mbmi_ext->ref_mv_count[ref_frame_type] >
......
......@@ -2481,7 +2481,13 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
__m128i in0[16], in1[16];
switch (tx_type) {
case DCT_DCT: aom_fdct16x16_sse2(input, output, stride); break;
case DCT_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case ADST_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
......
This diff is collapsed.
......@@ -48,6 +48,16 @@ void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
}
#endif // CONFIG_AOM_HIGHBITDEPTH
#if HAVE_AVX2
void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
int tx_type) {
(void)in;
(void)out;
(void)stride;
(void)tx_type;
}
#endif
class AV1Trans16x16HT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Ht16x16Param> {
public:
......@@ -95,11 +105,11 @@ class AV1HighbdTrans16x16HT
num_coeffs_ = 256;
input_ = reinterpret_cast<int16_t *>(
aom_memalign(16, sizeof(int16_t) * num_coeffs_));
aom_memalign(32, sizeof(int16_t) * num_coeffs_));
output_ = reinterpret_cast<int32_t *>(
aom_memalign(16, sizeof(int32_t) * num_coeffs_));
aom_memalign(32, sizeof(int32_t) * num_coeffs_));
output_ref_ = reinterpret_cast<int32_t *>(
aom_memalign(16, sizeof(int32_t) * num_coeffs_));
aom_memalign(32, sizeof(int32_t) * num_coeffs_));
}
virtual void TearDown() {
......@@ -190,6 +200,30 @@ INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT,
::testing::ValuesIn(kArrayHt16x16Param_sse2));
#endif // HAVE_SSE2
#if HAVE_AVX2
const Ht16x16Param kArrayHt16x16Param_avx2[] = {
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 256),
#if CONFIG_EXT_TX
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256),
make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256)
#endif // CONFIG_EXT_TX
};
INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
::testing::ValuesIn(kArrayHt16x16Param_avx2));
#endif // HAVE_AVX2
#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 0, 10),
......
......@@ -878,6 +878,12 @@ INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
AOM_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans16x16Test,
::testing::Values(make_tuple(&aom_fdct16x16_1_avx2,
AOM_BITS_8)));
#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment