Commit 6ae0054c authored by Yi Luo's avatar Yi Luo

Highbd loop filter AVX2

- Speed test (ms) on i7-6700, Linux x86_64
  FUNCTION             SSE2    AVX2
  horizontal_edge_16   55      28
  vertical_16_dual     84      47
  horizontal_4_dual    27      13
  horizontal_8_dual    36      15
  vertical_4_dual      38      25
  vertical_8_dual      44      27
- Decoder frame rate improves around 1.2% - 2.8%.

Change-Id: I9c4123869bac9b6d32e626173c2a8e7eb0cf49e7
parent ea71de8a
......@@ -46,6 +46,7 @@ set(AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
set(AOM_DSP_COMMON_ASM_SSSE3
......@@ -66,6 +67,7 @@ set(AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
......@@ -189,7 +191,8 @@ if (CONFIG_HIGHBITDEPTH)
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c")
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
else ()
set(AOM_DSP_COMMON_INTRIN_DSPR2
${AOM_DSP_COMMON_INTRIN_DSPR2}
......
......@@ -165,6 +165,7 @@ DSP_SRCS-yes += loopfilter.c
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
DSP_SRCS-$(HAVE_SSE2) += x86/lpf_common_sse2.h
DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
ifeq ($(HAVE_NEON_ASM),yes)
......@@ -194,10 +195,12 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c
ifeq ($(CONFIG_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_loopfilter_avx2.c
endif # CONFIG_HIGHBITDEPTH
DSP_SRCS-yes += txfm_common.h
DSP_SRCS-yes += x86/txfm_common_intrin.h
DSP_SRCS-$(HAVE_AVX2) += x86/common_avx2.h
DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
DSP_SRCS-$(HAVE_SSSE3) += x86/obmc_intrinsic_ssse3.h
DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
......
......@@ -305,37 +305,37 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_lpf_vertical_16 sse2/;
add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_16_dual sse2/;
specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_8 sse2/;
add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_vertical_8_dual sse2/;
specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_4 sse2/;
add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_vertical_4_dual sse2/;
specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/;
add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2/;
specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/;
add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_horizontal_8_dual sse2/;
specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2/;
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
} # CONFIG_HIGHBITDEPTH
#
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_COMMON_AVX2_H
#define AOM_DSP_X86_COMMON_AVX2_H
#include <immintrin.h>
#include "./aom_config.h"
// Note: in and out could have the same value
static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
__m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
__m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
__m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
__m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
__m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
__m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
__m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
__m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
__m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
__m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
__m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
__m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
__m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
__m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
__m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
__m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
// 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
// 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
// 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
// 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
// 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
// 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
// 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
// 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
// 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
// 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
// a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
// a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
// c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
// c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
// e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
// e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
__m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
__m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
__m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
__m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
__m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
__m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
__m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
__m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
__m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
__m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
__m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
__m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
__m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
__m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
__m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
__m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
// 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
// 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
// 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
// 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
// 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
// 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
// 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
// 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
// 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
// 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
// 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
// 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
// c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
// c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
// c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
// c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
// 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
// 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
// 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
// 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
// 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
// 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
// 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
// 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
// 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
// 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
// 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
// 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
// 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
// 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
#endif
This diff is collapsed.
......@@ -12,6 +12,7 @@
#include <emmintrin.h> // SSE2
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"
......@@ -888,118 +889,6 @@ void aom_highbd_lpf_horizontal_4_dual_sse2(
aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
int out_p, int num_8x8_to_transpose) {
int idx8x8 = 0;
__m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
do {
uint16_t *in = src[idx8x8];
uint16_t *out = dst[idx8x8];
p0 =
_mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
p1 =
_mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
p2 =
_mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
p3 =
_mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
p4 =
_mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
p5 =
_mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
p6 =
_mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
p7 =
_mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
// 00 10 01 11 02 12 03 13
x0 = _mm_unpacklo_epi16(p0, p1);
// 20 30 21 31 22 32 23 33
x1 = _mm_unpacklo_epi16(p2, p3);
// 40 50 41 51 42 52 43 53
x2 = _mm_unpacklo_epi16(p4, p5);
// 60 70 61 71 62 72 63 73
x3 = _mm_unpacklo_epi16(p6, p7);
// 00 10 20 30 01 11 21 31
x4 = _mm_unpacklo_epi32(x0, x1);
// 40 50 60 70 41 51 61 71
x5 = _mm_unpacklo_epi32(x2, x3);
// 00 10 20 30 40 50 60 70
x6 = _mm_unpacklo_epi64(x4, x5);
// 01 11 21 31 41 51 61 71
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
// 00 10 20 30 40 50 60 70
_mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
// 01 11 21 31 41 51 61 71
// 02 12 22 32 03 13 23 33
x4 = _mm_unpackhi_epi32(x0, x1);
// 42 52 62 72 43 53 63 73
x5 = _mm_unpackhi_epi32(x2, x3);
// 02 12 22 32 42 52 62 72
x6 = _mm_unpacklo_epi64(x4, x5);
// 03 13 23 33 43 53 63 73
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
// 02 12 22 32 42 52 62 72
_mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
// 03 13 23 33 43 53 63 73
// 04 14 05 15 06 16 07 17
x0 = _mm_unpackhi_epi16(p0, p1);
// 24 34 25 35 26 36 27 37
x1 = _mm_unpackhi_epi16(p2, p3);
// 44 54 45 55 46 56 47 57
x2 = _mm_unpackhi_epi16(p4, p5);
// 64 74 65 75 66 76 67 77
x3 = _mm_unpackhi_epi16(p6, p7);
// 04 14 24 34 05 15 25 35
x4 = _mm_unpacklo_epi32(x0, x1);
// 44 54 64 74 45 55 65 75
x5 = _mm_unpacklo_epi32(x2, x3);
// 04 14 24 34 44 54 64 74
x6 = _mm_unpacklo_epi64(x4, x5);
// 05 15 25 35 45 55 65 75
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
// 04 14 24 34 44 54 64 74
_mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
// 05 15 25 35 45 55 65 75
// 06 16 26 36 07 17 27 37
x4 = _mm_unpackhi_epi32(x0, x1);
// 46 56 66 76 47 57 67 77
x5 = _mm_unpackhi_epi32(x2, x3);
// 06 16 26 36 46 56 66 76
x6 = _mm_unpacklo_epi64(x4, x5);
// 07 17 27 37 47 57 67 77
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
// 06 16 26 36 46 56 66 76
_mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
// 07 17 27 37 47 57 67 77
} while (++idx8x8 < num_8x8_to_transpose);
}
static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
uint16_t *out, int out_p) {
uint16_t *src0[1];
uint16_t *src1[1];
uint16_t *dest0[1];
uint16_t *dest1[1];
src0[0] = in0;
src1[0] = in1;
dest0[0] = out;
dest1[0] = out + 8;
highbd_transpose(src0, in_p, dest0, out_p, 1);
highbd_transpose(src1, in_p, dest1, out_p, 1);
}
void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H
#define _AOM_DSP_X86_LPF_COMMON_X86_H
#include <emmintrin.h> // SSE2
#include "./aom_config.h"
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
int out_p, int num_8x8_to_transpose) {
int idx8x8 = 0;
__m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
do {
uint16_t *in = src[idx8x8];
uint16_t *out = dst[idx8x8];
p0 =
_mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
p1 =
_mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
p2 =
_mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
p3 =
_mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
p4 =
_mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
p5 =
_mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
p6 =
_mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
p7 =
_mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
// 00 10 01 11 02 12 03 13
x0 = _mm_unpacklo_epi16(p0, p1);
// 20 30 21 31 22 32 23 33
x1 = _mm_unpacklo_epi16(p2, p3);
// 40 50 41 51 42 52 43 53
x2 = _mm_unpacklo_epi16(p4, p5);
// 60 70 61 71 62 72 63 73
x3 = _mm_unpacklo_epi16(p6, p7);
// 00 10 20 30 01 11 21 31
x4 = _mm_unpacklo_epi32(x0, x1);
// 40 50 60 70 41 51 61 71
x5 = _mm_unpacklo_epi32(x2, x3);
// 00 10 20 30 40 50 60 70
x6 = _mm_unpacklo_epi64(x4, x5);
// 01 11 21 31 41 51 61 71
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
// 00 10 20 30 40 50 60 70
_mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
// 01 11 21 31 41 51 61 71
// 02 12 22 32 03 13 23 33
x4 = _mm_unpackhi_epi32(x0, x1);
// 42 52 62 72 43 53 63 73
x5 = _mm_unpackhi_epi32(x2, x3);
// 02 12 22 32 42 52 62 72
x6 = _mm_unpacklo_epi64(x4, x5);
// 03 13 23 33 43 53 63 73
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
// 02 12 22 32 42 52 62 72
_mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
// 03 13 23 33 43 53 63 73
// 04 14 05 15 06 16 07 17
x0 = _mm_unpackhi_epi16(p0, p1);
// 24 34 25 35 26 36 27 37
x1 = _mm_unpackhi_epi16(p2, p3);
// 44 54 45 55 46 56 47 57
x2 = _mm_unpackhi_epi16(p4, p5);
// 64 74 65 75 66 76 67 77
x3 = _mm_unpackhi_epi16(p6, p7);
// 04 14 24 34 05 15 25 35
x4 = _mm_unpacklo_epi32(x0, x1);
// 44 54 64 74 45 55 65 75
x5 = _mm_unpacklo_epi32(x2, x3);
// 04 14 24 34 44 54 64 74
x6 = _mm_unpacklo_epi64(x4, x5);
// 05 15 25 35 45 55 65 75
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
// 04 14 24 34 44 54 64 74
_mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
// 05 15 25 35 45 55 65 75
// 06 16 26 36 07 17 27 37
x4 = _mm_unpackhi_epi32(x0, x1);
// 46 56 66 76 47 57 67 77
x5 = _mm_unpackhi_epi32(x2, x3);
// 06 16 26 36 46 56 66 76
x6 = _mm_unpacklo_epi64(x4, x5);
// 07 17 27 37 47 57 67 77
x7 = _mm_unpackhi_epi64(x4, x5);
_mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
// 06 16 26 36 46 56 66 76
_mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
// 07 17 27 37 47 57 67 77
} while (++idx8x8 < num_8x8_to_transpose);
}
static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
uint16_t *out, int out_p) {
uint16_t *src0[1];
uint16_t *src1[1];
uint16_t *dest0[1];
uint16_t *dest1[1];
src0[0] = in0;
src1[0] = in1;
dest0[0] = out;
dest1[0] = out + 8;
highbd_transpose(src0, in_p, dest0, out_p, 1);
highbd_transpose(src1, in_p, dest1, out_p, 1);
}
#endif // _AOM_DSP_X86_LPF_COMMON_X86_H
......@@ -15,6 +15,7 @@
#include <immintrin.h>
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/common_avx2.h"
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
......@@ -34,135 +35,6 @@ static INLINE void mm256_reverse_epi16(__m256i *u) {
*u = _mm256_permute2x128_si256(v, v, 1);
}
// Note: in and out could have the same value
static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
__m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
__m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
__m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
__m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
__m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
__m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
__m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
__m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
__m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
__m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
__m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
__m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
__m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
__m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
__m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
__m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
// 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
// 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
// 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
// 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
// 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
// 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
// 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
// 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
// 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
// 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
// a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
// a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
// c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
// c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
// e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
// e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
__m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
__m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
__m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
__m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
__m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
__m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
__m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
__m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
__m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
__m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
__m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
__m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
__m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
__m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
__m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
__m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
// 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
// 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
// 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
// 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
// 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
// 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
// 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
// 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
// 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
// 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
// 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
// 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
// c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
// c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
// c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
// c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
// 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
// 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
// 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
// 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
// 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
// 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
// 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
// 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
// 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
// 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
// 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
// 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
// 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
// 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
const __m256i *cospi) {
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
......
......@@ -35,6 +35,8 @@ const int kNumCoeffs = 1024;
const int number_of_iterations = 10000;
const int kSpeedTestNum = 500000;
#if CONFIG_HIGHBITDEPTH
typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh, int bd);
......@@ -242,6 +244,43 @@ TEST_P(Loop8Test6Param, ValueCheck) {
<< "First failed at test case " << first_failure;
}
TEST_P(Loop8Test6Param, DISABLED_Speed) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = kSpeedTestNum;
#if CONFIG_HIGHBITDEPTH
const int32_t bd = bit_depth_;
DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
#else
DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
#endif // CONFIG_HIGHBITDEPTH
uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,