Commit 40f22ef8 authored by Yi Luo, committed by Yaowu Xu

Partial IDCT 32x32 avx2

- Function-level improvement (run time in ms):
Function        ssse3  avx2   Reduction
idct32x32_1024  794    374    52.9%
idct32x32_135   354    169    52.2%
idct32x32_34    197    142    27.9%
idct32x32_1     n/a     26    n/a
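(Reduction = (ssse3 - avx2) / ssse3; e.g. for the 1024-coefficient kernel, (794 - 374) / 794 ≈ 52.9%.)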

- Integrated with the default scan order (eob-based kernel selection in the decoder).

Change-Id: I84815112b26b8a8cb800281a1cfb1706342af57d
parent ee674323
......@@ -449,18 +449,18 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_idct16x16_1_add sse2 avx2/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1024_add sse2 ssse3/;
specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_135_add sse2 ssse3/;
specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/;
# Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_34_add sse2 ssse3/;
specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/;
add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1_add sse2/;
specialize qw/aom_idct32x32_1_add sse2 avx2/;
add_proto qw/void aom_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/aom_highbd_idct4x4_16_add sse2/;
......@@ -495,10 +495,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_idct16x16_10_add sse2 avx2 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2 neon dspr2 msa/;
# Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
$aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
......@@ -506,12 +506,12 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
$aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2 neon dspr2 msa/;
# Need to add 34 eob idct32x32 neon implementation.
$aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1_add sse2 neon dspr2 msa/;
specialize qw/aom_idct32x32_1_add sse2 avx2 neon dspr2 msa/;
add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_iwht4x4_1_add msa/;
......
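The lines above are from the rtcd definition file (aom_dsp_rtcd_defs.pl): add_proto declares a dispatchable prototype and specialize lists the SIMD variants the build may bind to it, so appending avx2 is what makes the new kernels reachable at run time. Below is a minimal sketch of the resulting dispatch pattern; the flag macros, flag values, setup-function name, and tran_low_t stand-in are illustrative only and are not copied from the generated header (the real feature bits live in aom_ports/x86.h, and tran_low_t depends on CONFIG_HIGHBITDEPTH).

#include <stdint.h>

typedef int32_t tran_low_t; /* stand-in; int16_t in low-bitdepth builds */
typedef void (*idct_add_fn)(const tran_low_t *input, uint8_t *dest,
                            int dest_stride);

/* Kernels the .pl entries above refer to (implemented elsewhere in the tree). */
void aom_idct32x32_1024_add_c(const tran_low_t *, uint8_t *, int);
void aom_idct32x32_1024_add_sse2(const tran_low_t *, uint8_t *, int);
void aom_idct32x32_1024_add_ssse3(const tran_low_t *, uint8_t *, int);
void aom_idct32x32_1024_add_avx2(const tran_low_t *, uint8_t *, int);

/* Illustrative CPU-feature bits; not the real values. */
#define HAS_SSE2 (1 << 0)
#define HAS_SSSE3 (1 << 1)
#define HAS_AVX2 (1 << 2)

idct_add_fn aom_idct32x32_1024_add;

/* Pick the best available kernel once, at startup, from the CPU flags. */
static void setup_idct32x32_dispatch(int cpu_flags) {
  aom_idct32x32_1024_add = aom_idct32x32_1024_add_c; /* portable fallback */
  if (cpu_flags & HAS_SSE2) aom_idct32x32_1024_add = aom_idct32x32_1024_add_sse2;
  if (cpu_flags & HAS_SSSE3) aom_idct32x32_1024_add = aom_idct32x32_1024_add_ssse3;
  if (cpu_flags & HAS_AVX2) aom_idct32x32_1024_add = aom_idct32x32_1024_add_avx2;
}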
This diff is collapsed.
......@@ -52,12 +52,12 @@ static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
}
#define IDCT_ROUNDING_POS (6)
static INLINE void write_buffer_16x16(__m256i *in, const int stride,
uint8_t *output) {
static INLINE void store_buffer_16xN(__m256i *in, const int stride,
uint8_t *output, int num) {
const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
int i = 0;
while (i < 16) {
while (i < num) {
in[i] = _mm256_adds_epi16(in[i], rounding);
in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
recon_and_store(&in[i], output + i * stride);
......
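Renaming write_buffer_16x16 to store_buffer_16xN generalizes the final store so a caller can flush any number of 16-wide row registers, presumably so the new 32x32 path can reuse it. Per row it adds 1 << (IDCT_ROUNDING_POS - 1) and arithmetic-shifts right by IDCT_ROUNDING_POS, i.e. rounds to nearest by 2^6, before recon_and_store adds the residual to the prediction. A scalar sketch of that per-pixel math follows, under the assumption that recon_and_store saturates the sum to [0, 255] as a packus-based reconstruction normally does:

#include <stdint.h>

#define IDCT_ROUNDING_POS 6

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One 16-wide row of store_buffer_16xN, written out in scalar form:
   round the residual to the nearest multiple of 2^6, shift it down,
   then add it to the predictor with clamping. */
static void round_shift_add_row(const int16_t *residual, uint8_t *dest) {
  for (int j = 0; j < 16; ++j) {
    /* >> of a negative value is arithmetic on the targets aom supports,
       matching _mm256_srai_epi16. */
    const int r =
        (residual[j] + (1 << (IDCT_ROUNDING_POS - 1))) >> IDCT_ROUNDING_POS;
    dest[j] = clip_pixel(dest[j] + r);
  }
}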
......@@ -34,7 +34,8 @@ static INLINE void mm256_reverse_epi16(__m256i *u) {
*u = _mm256_permute2x128_si256(v, v, 1);
}
static INLINE void mm256_transpose_16x16(__m256i *in) {
// Note: in and out could have the same value
static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
__m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
__m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
__m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
......@@ -143,23 +144,23 @@ static INLINE void mm256_transpose_16x16(__m256i *in) {
// 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
// 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
......
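The transpose now takes separate input and output pointers, with the added comment making the aliasing contract explicit; every caller touched by this patch simply passes the same array twice. A short sketch of the two call patterns it permits (the local buffers here are hypothetical):

#include <immintrin.h>

__m256i rows[16]; /* 16 rows of 16 int16 coefficients */
__m256i cols[16];

mm256_transpose_16x16(rows, cols); /* out-of-place: rows is only read */
mm256_transpose_16x16(rows, rows); /* in-place: allowed per the note above */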
......@@ -1269,6 +1269,9 @@ static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
else if (eob <= 34)
// non-zero coeff only in upper-left 8x8
aom_idct32x32_34_add(input, dest, stride);
else if (eob <= 135)
// non-zero coeff only in upper-left 16x16
aom_idct32x32_135_add(input, dest, stride);
#endif
else
aom_idct32x32_1024_add(input, dest, stride);
......
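With this hunk the decoder picks the cheapest 32x32 inverse transform from eob, the position of the last nonzero coefficient under the default scan order: at most 34 coefficients stay inside the upper-left 8x8, and at most 135 inside the upper-left 16x16. Reconstructed from the hunk, the selection reads roughly as below; the eob == 1 branch, the exact parameter list, and the surrounding preprocessor guards sit outside the shown context, so treat this as a sketch rather than the literal source:

static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                          int eob) {
  if (eob == 1)
    aom_idct32x32_1_add(input, dest, stride);    /* DC-only block */
  else if (eob <= 34)
    aom_idct32x32_34_add(input, dest, stride);   /* upper-left 8x8 */
  else if (eob <= 135)
    aom_idct32x32_135_add(input, dest, stride);  /* upper-left 16x16, new path */
  else
    aom_idct32x32_1024_add(input, dest, stride); /* full 32x32 transform */
}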
......@@ -158,7 +158,7 @@ void av1_idct16_avx2(__m256i *in) {
}
static void idct16(__m256i *in) {
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
av1_idct16_avx2(in);
}
......@@ -340,7 +340,7 @@ static void iadst16_avx2(__m256i *in) {
}
static void iadst16(__m256i *in) {
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
iadst16_avx2(in);
}
......@@ -358,7 +358,7 @@ static void flip_col(uint8_t **dest, int *stride, int rows) {
}
static void iidtx16(__m256i *in) {
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
txfm_scaling16_avx2(Sqrt2, in);
}
#endif
......@@ -445,5 +445,5 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
#endif // CONFIG_EXT_TX
default: assert(0); break;
}
write_buffer_16x16(in, stride, dest);
store_buffer_16xN(in, stride, dest, 16);
}
......@@ -953,7 +953,9 @@ void fadst16_avx2(__m256i *in) {
}
#if CONFIG_EXT_TX
static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
static void fidtx16_avx2(__m256i *in) {
txfm_scaling16_avx2((int16_t)Sqrt2, in);
}
#endif
void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
......@@ -964,28 +966,28 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
case DCT_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case ADST_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
......@@ -993,91 +995,91 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
case FLIPADST_DCT:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fdct16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, stride, 1, 1, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case IDTX:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case V_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case V_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case V_FLIPADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fidtx16_avx2(in);
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
}
mm256_transpose_16x16(in);
mm256_transpose_16x16(in, in);
write_buffer_16x16(in, output);
_mm256_zeroupper();
}
......@@ -1110,10 +1112,10 @@ static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
}
static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
mm256_transpose_16x16(in0);
mm256_transpose_16x16(&in0[16]);
mm256_transpose_16x16(in1);
mm256_transpose_16x16(&in1[16]);
mm256_transpose_16x16(in0, in0);
mm256_transpose_16x16(&in0[16], &in0[16]);
mm256_transpose_16x16(in1, in1);
mm256_transpose_16x16(&in1[16], &in1[16]);
mm256_vectors_swap(&in0[16], in1, 16);
}
......@@ -1464,7 +1466,7 @@ static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
static void fhalfright32_16col_avx2(__m256i *in) {
int i = 0;
const __m256i zero = _mm256_setzero_si256();
const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2);
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
__m256i x0, x1;
......
......@@ -428,6 +428,14 @@ const PartialInvTxfmParam avx2_partial_idct_tests[] = {
&wrapper<aom_idct16x16_10_add_avx2>, TX_16X16, 10, 8, 1),
make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
&wrapper<aom_idct16x16_1_add_avx2>, TX_16X16, 1, 8, 1),
make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
&wrapper<aom_idct32x32_1024_add_avx2>, TX_32X32, 1024, 8, 1),
make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
&wrapper<aom_idct32x32_135_add_avx2>, TX_32X32, 135, 8, 1),
make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
&wrapper<aom_idct32x32_34_add_avx2>, TX_32X32, 34, 8, 1),
make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
&wrapper<aom_idct32x32_1_add_avx2>, TX_32X32, 1, 8, 1),
};
INSTANTIATE_TEST_CASE_P(AVX2, PartialIDctTest,
......