From 6d0ed3ed38538c94171c4484733ee54807b61fab Mon Sep 17 00:00:00 2001 From: Yaowu Xu Date: Mon, 12 Feb 2018 10:38:16 -0800 Subject: [PATCH] Remove CONFIG_PARALLEL_DEBLOCKING The experiment is fully adopted now. Change-Id: I27906d2af4c746ce55aa17f64d1c0ef281e23ab2 --- aom_dsp/aom_dsp.cmake | 47 -- aom_dsp/aom_dsp_rtcd_defs.pl | 119 ++-- aom_dsp/loopfilter.c | 150 +---- aom_dsp/x86/highbd_loopfilter_avx2.c | 811 -------------------------- aom_dsp/x86/highbd_loopfilter_sse2.c | 59 -- aom_dsp/x86/loopfilter_sse2.c | 128 ---- av1/common/av1_loopfilter.c | 27 - av1/common/thread_common.c | 111 +--- build/cmake/aom_config_defaults.cmake | 1 - build/cmake/aom_experiment_deps.cmake | 3 - test/lpf_test.cc | 22 - 11 files changed, 44 insertions(+), 1434 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index aacc04510..85362d077 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -86,12 +86,6 @@ set(AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h") -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_AVX2 - ${AOM_DSP_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") -endif () - if (NOT CONFIG_EXT_PARTITION) set(AOM_DSP_COMMON_ASM_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" @@ -113,14 +107,6 @@ set(AOM_DSP_COMMON_ASM_NEON "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm" "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm") -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_ASM_NEON - ${AOM_DSP_COMMON_ASM_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm") -endif () if (NOT CONFIG_EXT_PARTITION) set(AOM_DSP_COMMON_INTRIN_NEON @@ -140,11 +126,6 @@ set(AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c") -endif () if ("${AOM_TARGET_CPU}" STREQUAL "arm64") if (NOT CONFIG_EXT_PARTITION) @@ -168,13 +149,6 @@ if ("${AOM_TARGET_CPU}" STREQUAL "arm64") "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c") - if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") - endif () endif () set(AOM_DSP_COMMON_INTRIN_DSPR2 @@ -196,18 +170,6 @@ set(AOM_DSP_COMMON_INTRIN_DSPR2 "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_DSPR2 - ${AOM_DSP_COMMON_INTRIN_DSPR2} - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") -endif () - set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c" @@ -230,15 +192,6 @@ set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_MSA - ${AOM_DSP_COMMON_INTRIN_MSA} - "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h") -endif () - set(AOM_DSP_COMMON_ASM_SSE2 ${AOM_DSP_COMMON_ASM_SSE2} "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 326fd6caf..9446d5781 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -432,130 +432,77 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { # Loopfilter # add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_16 sse2/; -} else { - specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; -} +specialize qw/aom_lpf_vertical_16 sse2/; add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; -} add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_6 sse2/; -} +specialize qw/aom_lpf_vertical_6 sse2/; add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_8 sse2/; -} else { - specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_vertical_8 sse2/; add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; -} add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_4 sse2/; -} else { - specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_vertical_4 sse2/; add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; -} add_proto qw/void aom_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_16 sse2/; -} else { - specialize qw/aom_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_16_neon_asm=aom_lpf_horizontal_16_neon; -} +specialize qw/aom_lpf_horizontal_16 sse2/; add_proto qw/void aom_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_16_dual sse2/; -} else { - specialize qw/aom_lpf_horizontal_16_dual sse2 avx2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_16_dual_neon_asm=aom_lpf_horizontal_16_dual_neon; -} +specialize qw/aom_lpf_horizontal_16_dual sse2/; add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_6 sse2/; -} +specialize qw/aom_lpf_horizontal_6 sse2/; add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_8 sse2/; -} else { - specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_horizontal_8 sse2/; add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; -} add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_4 sse2/; -} else { - specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_horizontal_4 sse2/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; -} - add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_16 sse2/; - add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_8 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_8 sse2/; - add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_4 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_4 sse2/; - add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_16 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_16 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_16_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_horizontal_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_16_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_8 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_4 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; # Helper functions. add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c index 6705140e6..252908ba0 100644 --- a/aom_dsp/loopfilter.c +++ b/aom_dsp/loopfilter.c @@ -36,7 +36,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) { default: return (int16_t)clamp(t, -128, 128 - 1); } } -#if CONFIG_PARALLEL_DEBLOCKING + // should we apply any filter at all: 11111111 yes, 00000000 no static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { @@ -46,7 +46,7 @@ static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } -#endif // CONFIG_PARALLEL_DEBLOCKING + static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { @@ -156,25 +156,14 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); -#else // CONFIG_PARALLEL_DEBLOCKING const uint8_t p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } @@ -185,35 +174,20 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); -#if CONFIG_PARALLEL_DEBLOCKING aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); -#else - aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); -#endif } void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); -#else // CONFIG_PARALLEL_DEBLOCKING const uint8_t p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } @@ -224,11 +198,7 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); -#if CONFIG_PARALLEL_DEBLOCKING aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); -#else - aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); -#endif } #if PARALLEL_DEBLOCKING_5_TAP_CHROMA @@ -274,11 +244,7 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -299,11 +265,7 @@ void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -325,22 +287,14 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); -#if CONFIG_PARALLEL_DEBLOCKING aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); -#else - aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); -#endif } #if PARALLEL_DEBLOCKING_5_TAP_CHROMA void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -357,11 +311,7 @@ void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -380,11 +330,7 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); -#if CONFIG_PARALLEL_DEBLOCKING aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); -#else - aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); -#endif } #if PARALLEL_DEBLOCKING_13_TAP @@ -497,11 +443,7 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int step = 4; -#else - int step = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -581,23 +523,14 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { -#if CONFIG_PARALLEL_DEBLOCKING mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); -#else - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); -#endif } void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { -#if CONFIG_PARALLEL_DEBLOCKING mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); -#else - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); -#endif } -#if CONFIG_PARALLEL_DEBLOCKING // Should we apply any filter at all: 11111111 yes, 00000000 no ? static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0, @@ -610,7 +543,6 @@ static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } -#endif // CONFIG_PARALLEL_DEBLOCKING // Should we apply any filter at all: 11111111 yes, 00000000 no ? static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, @@ -739,34 +671,17 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#else // CONFIG_PARALLEL_DEBLOCKING const uint16_t p1 = s[-2 * p]; const uint16_t p0 = s[-p]; const uint16_t q0 = s[0 * p]; const uint16_t q1 = s[1 * p]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); -#endif // !CONFIG_PARALLEL_DEBLOCKING highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); ++s; } @@ -777,37 +692,22 @@ void aom_highbd_lpf_horizontal_4_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); -#else - aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); -#endif } void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#else // CONFIG_PARALLEL_DEBLOCKING const uint16_t p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); -#endif // !CONFIG_PARALLEL_DEBLOCKING highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); s += pitch; } @@ -818,13 +718,8 @@ void aom_highbd_lpf_vertical_4_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); -#else - aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, - bd); -#endif } #if PARALLEL_DEBLOCKING_5_TAP_CHROMA @@ -871,11 +766,7 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -898,11 +789,7 @@ void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -925,11 +812,7 @@ void aom_highbd_lpf_horizontal_8_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); -#else - aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); -#endif } #if PARALLEL_DEBLOCKING_5_TAP_CHROMA @@ -937,11 +820,7 @@ void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -960,11 +839,7 @@ void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -984,13 +859,8 @@ void aom_highbd_lpf_vertical_8_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); -#else - aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, - bd); -#endif } #if PARALLEL_DEBLOCKING_13_TAP @@ -1129,11 +999,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *thresh, int count, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING int step = 4; -#else - int step = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -1183,11 +1049,7 @@ void aom_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); -#else - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); -#endif } static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, @@ -1232,20 +1094,12 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); -#else - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); -#endif } void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); -#else - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); -#endif } diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c index b904e09df..412be63b6 100644 --- a/aom_dsp/x86/highbd_loopfilter_avx2.c +++ b/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -16,194 +16,6 @@ #include "aom_dsp/x86/lpf_common_sse2.h" #include "aom/aom_integer.h" -#if !CONFIG_PARALLEL_DEBLOCKING -static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, - const uint8_t *t, int bd, __m256i *blt, - __m256i *lt, __m256i *thr) { - const int shift = bd - 8; - const __m128i zero = _mm_setzero_si128(); - - __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); - __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *blt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); - y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *lt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); - y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *thr = _mm256_slli_epi16(y, shift); -} - -static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, - __m256i *p, __m256i *q) { - int i; - for (i = 0; i < size; i++) { - p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); - q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); - } -} - -static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, - const __m256i *t, __m256i *hev) { - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); - __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); - h = _mm256_subs_epu16(h, *t); - - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - const __m256i zero = _mm256_setzero_si256(); - *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); -} - -static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, - const __m256i *l, const __m256i *bl, - __m256i *mask) { - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); - max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); - max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); - - int i; - for (i = 1; i < 4; ++i) { - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); - } - max = _mm256_subs_epu16(max, *l); - *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask -} - -static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, - const __m256i *q, int bd, int start, - int end, __m256i *flat) { - __m256i max = _mm256_setzero_si256(); - int i; - for (i = start; i < end; ++i) { - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); - } - - __m256i ft; - if (bd == 8) - ft = _mm256_subs_epu16(max, *th); - else if (bd == 10) - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); - else // bd == 12 - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); - - const __m256i zero = _mm256_setzero_si256(); - *flat = _mm256_cmpeq_epi16(ft, zero); -} - -// Note: -// Access p[3-1], p[0], and q[3-1], q[0] -static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - // check the distance 1,2,3 against 0 - flat_mask_internal(th, p, q, bd, 1, 4, flat); -} - -// Note: -// access p[7-4], p[0], and q[7-4], q[0] -static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - flat_mask_internal(th, p, q, bd, 4, 8, flat); -} - -static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, - __m256i *pixel) { - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(*pixel, *max); - clamped = _mm256_andnot_si256(mask, *pixel); - mask = _mm256_and_si256(mask, *max); - clamped = _mm256_or_si256(mask, clamped); - - mask = _mm256_cmpgt_epi16(clamped, *min); - clamped = _mm256_and_si256(mask, clamped); - mask = _mm256_andnot_si256(mask, *min); - *pixel = _mm256_or_si256(clamped, mask); -} - -static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, - const __m256i *th, int bd, __m256i *ps, - __m256i *qs) { - __m256i t80; - if (bd == 8) - t80 = _mm256_set1_epi16(0x80); - else if (bd == 10) - t80 = _mm256_set1_epi16(0x200); - else // bd == 12 - t80 = _mm256_set1_epi16(0x800); - - __m256i ps0 = _mm256_subs_epi16(p[0], t80); - __m256i ps1 = _mm256_subs_epi16(p[1], t80); - __m256i qs0 = _mm256_subs_epi16(q[0], t80); - __m256i qs1 = _mm256_subs_epi16(q[1], t80); - - const __m256i one = _mm256_set1_epi16(1); - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i zero = _mm256_setzero_si256(); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filter = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filter); - - __m256i hev; - highbd_hev_mask(p, q, th, &hev); - filter = _mm256_and_si256(filter, hev); - - const __m256i x = _mm256_subs_epi16(qs0, ps0); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - pixel_clamp(&pmin, &pmax, &filter); - filter = _mm256_and_si256(filter, *mask); - - const __m256i t3 = _mm256_set1_epi16(3); - const __m256i t4 = _mm256_set1_epi16(4); - - __m256i filter1 = _mm256_adds_epi16(filter, t4); - __m256i filter2 = _mm256_adds_epi16(filter, t3); - pixel_clamp(&pmin, &pmax, &filter1); - pixel_clamp(&pmin, &pmax, &filter2); - filter1 = _mm256_srai_epi16(filter1, 3); - filter2 = _mm256_srai_epi16(filter2, 3); - - qs0 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &qs0); - ps0 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &ps0); - - qs[0] = _mm256_adds_epi16(qs0, t80); - ps[0] = _mm256_adds_epi16(ps0, t80); - - filter = _mm256_adds_epi16(filter1, one); - filter = _mm256_srai_epi16(filter, 1); - filter = _mm256_andnot_si256(hev, filter); - - qs1 = _mm256_subs_epi16(qs1, filter); - pixel_clamp(&pmin, &pmax, &qs1); - ps1 = _mm256_adds_epi16(ps1, filter); - pixel_clamp(&pmin, &pmax, &ps1); - - qs[1] = _mm256_adds_epi16(qs1, t80); - ps[1] = _mm256_adds_epi16(ps1, t80); -} -#endif // #if !CONFIG_PARALLEL_DEBLOCKING - -#if CONFIG_PARALLEL_DEBLOCKING void aom_highbd_lpf_horizontal_16_dual_avx2(uint16_t *s, int p, const uint8_t *blt, const uint8_t *lt, @@ -248,626 +60,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2( aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } -#else -void aom_highbd_lpf_horizontal_16_dual_avx2(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - __m256i blimit, limit, thresh; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - - __m256i p[8], q[8]; - load_highbd_pixel(s, 8, pitch, p, q); - - __m256i mask; - highbd_filter_mask(p, q, &limit, &blimit, &mask); - - __m256i flat, flat2; - const __m256i one = _mm256_set1_epi16(1); - highbd_flat_mask4(&one, p, q, &flat, bd); - highbd_flat_mask5(&one, p, q, &flat2, bd); - - flat = _mm256_and_si256(flat, mask); - flat2 = _mm256_and_si256(flat2, flat); - - __m256i ps[2], qs[2]; - highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - - // flat and wide flat calculations - __m256i flat_p[3], flat_q[3]; - __m256i flat2_p[7], flat2_q[7]; - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - - __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), - _mm256_add_epi16(p[4], p[3])); - __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), - _mm256_add_epi16(q[4], q[3])); - - __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); - sum_p = _mm256_add_epi16(sum_p, sum_lp); - - __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); - sum_q = _mm256_add_epi16(sum_q, sum_lq); - sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); - sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); - - flat2_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); - flat2_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); - flat_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); - flat_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); - - __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); - __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); - __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); - __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); - - sum_q = _mm256_sub_epi16(sum_p, p[6]); - sum_p = _mm256_sub_epi16(sum_p, q[6]); - flat2_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); - flat2_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); - - sum_lq = _mm256_sub_epi16(sum_lp, p[2]); - sum_lp = _mm256_sub_epi16(sum_lp, q[2]); - flat_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); - flat_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); - - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p3 = _mm256_add_epi16(sum_p3, p[3]); - sum_q3 = _mm256_add_epi16(sum_q3, q[3]); - - sum_p = _mm256_sub_epi16(sum_p, q[5]); - sum_q = _mm256_sub_epi16(sum_q, p[5]); - flat2_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); - flat2_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); - - sum_lp = _mm256_sub_epi16(sum_lp, q[1]); - sum_lq = _mm256_sub_epi16(sum_lq, p[1]); - flat_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); - flat_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); - - int i; - for (i = 3; i < 7; ++i) { - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); - sum_q = _mm256_sub_epi16(sum_q, p[7 - i]); - flat2_p[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); - flat2_q[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); - } - } - - // highbd_filter8 - p[2] = _mm256_andnot_si256(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm256_and_si256(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm256_andnot_si256(flat, q[2]); - flat_q[2] = _mm256_and_si256(flat, flat_q[2]); - q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values - - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm256_andnot_si256(flat, ps[i]); - flat_p[i] = _mm256_and_si256(flat, flat_p[i]); - p[i] = _mm256_or_si256(ps[i], flat_p[i]); - qs[i] = _mm256_andnot_si256(flat, qs[i]); - flat_q[i] = _mm256_and_si256(flat, flat_q[i]); - q[i] = _mm256_or_si256(qs[i], flat_q[i]); - } - - // highbd_filter16 - - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm256_andnot_si256(flat2, p[i]); - flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm256_andnot_si256(flat2, q[i]); - flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); - q[i] = _mm256_or_si256(q[i], flat2_q[i]); - _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); - _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); - } -} - -static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, - uint16_t *dst, int dst_p) { - __m256i x[16]; - int i; - for (i = 0; i < 16; ++i) { - x[i] = _mm256_loadu_si256((const __m256i *)src); - src += src_p; - } - mm256_transpose_16x16(x, x); - for (i = 0; i < 16; ++i) { - _mm256_storeu_si256((__m256i *)dst, x[i]); - dst += dst_p; - } -} - -void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[256]); - - // Transpose 16x16 - highbd_transpose16x16(s - 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_16_dual_avx2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); - - // Transpose back - highbd_transpose16x16(t_dst, 16, s - 8, p); -} - -static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, - const uint8_t *t0, const uint8_t *b1, - const uint8_t *l1, const uint8_t *t1, int bd, - __m256i *blt, __m256i *lt, __m256i *thr) { - const __m128i z128 = _mm_setzero_si128(); - const __m128i blimit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); - const __m128i limit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); - const __m128i thresh0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); - const __m128i blimit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); - const __m128i limit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); - const __m128i thresh1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); - - *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); - *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); - *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); - - int shift = bd - 8; - *blt = _mm256_slli_epi16(*blt, shift); - *lt = _mm256_slli_epi16(*lt, shift); - *thr = _mm256_slli_epi16(*thr, shift); -} - -void aom_highbd_lpf_horizontal_4_dual_avx2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); - __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80, tff80, tffe0, t1f, t7f; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - tff80 = _mm256_set1_epi16(0xff80); - tffe0 = _mm256_set1_epi16(0xffe0); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); - } else if (bd == 10) { - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); - } else { // bd == 12 - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); - } - - __m256i ps1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); - __m256i ps0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); - __m256i qs0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80); - __m256i qs1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); - - // filter_mask and hev_mask - const __m256i zero = _mm256_setzero_si256(); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - __m256i work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), - _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), - _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // filter4 - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - __m256i work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); - - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - __m256i filter1 = _mm256_adds_epi16(filt, t4); - pixel_clamp(&pmin, &pmax, &filter1); - __m256i filter2 = _mm256_adds_epi16(filt, t3); - pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 - filter1 = _mm256_srli_epi16(filter1, 3); - work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm256_and_si256(filter1, t1f); // clamp the range - filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits - - // Filter2 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter2); - filter2 = _mm256_srli_epi16(filter2, 3); - work_a = _mm256_and_si256(work_a, tffe0); - filter2 = _mm256_and_si256(filter2, t1f); - filter2 = _mm256_or_si256(filter2, work_a); - - // filt >> 1 - // equivalent to shifting 0x1f left by bitdepth - 8 - // and setting new bits to 1 - filt = _mm256_adds_epi16(filter1, one); - work_a = _mm256_cmpgt_epi16(zero, filt); - filt = _mm256_srli_epi16(filt, 1); - work_a = _mm256_and_si256(work_a, tff80); - filt = _mm256_and_si256(filt, t7f); - filt = _mm256_or_si256(filt, work_a); - - filt = _mm256_andnot_si256(hev, filt); - - filter1 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &filter1); - q0 = _mm256_adds_epi16(filter1, t80); - - filter1 = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &filter1); - q1 = _mm256_adds_epi16(filter1, t80); - - filter2 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &filter2); - p0 = _mm256_adds_epi16(filter2, t80); - - filter2 = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &filter2); - p1 = _mm256_adds_epi16(filter2, t80); - - _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); - _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); - _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); - _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); -} - -void aom_highbd_lpf_horizontal_8_dual_avx2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - } else if (bd == 10) { - t80 = _mm256_set1_epi16(0x200); - } else { // bd == 12 - t80 = _mm256_set1_epi16(0x800); - } - - __m256i ps1, ps0, qs0, qs1; - ps1 = _mm256_subs_epi16(p1, t80); - ps0 = _mm256_subs_epi16(p0, t80); - qs0 = _mm256_subs_epi16(q0, t80); - qs1 = _mm256_subs_epi16(q1, t80); - - // filter_mask and hev_mask - __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i zero = _mm256_set1_epi16(0); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - mask = _mm256_max_epi16(abs_q1q0, mask); - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // flat_mask4 - flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); - flat = _mm256_max_epi16(work, flat); - flat = _mm256_max_epi16(abs_p1p0, flat); - flat = _mm256_max_epi16(abs_q1q0, flat); - - if (bd == 8) - flat = _mm256_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); - - flat = _mm256_cmpeq_epi16(flat, zero); - flat = _mm256_and_si256(flat, mask); // flat & mask - - // Added before shift for rounding part of ROUND_POWER_OF_TWO - __m256i workp_a, workp_b, workp_shft; - workp_a = - _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); - const __m256i four = _mm256_set1_epi16(4); - workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); - - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); - - // lp filter - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt, filter1, filter2, work_a; - filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - filter1 = _mm256_adds_epi16(filt, t4); - filter2 = _mm256_adds_epi16(filt, t3); - - // Filter1 >> 3 - pixel_clamp(&pmin, &pmax, &filter1); - filter1 = _mm256_srai_epi16(filter1, 3); - - // Filter2 >> 3 - pixel_clamp(&pmin, &pmax, &filter2); - filter2 = _mm256_srai_epi16(filter2, 3); - - // filt >> 1 - filt = _mm256_adds_epi16(filter1, one); - filt = _mm256_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - filt = _mm256_andnot_si256(hev, filt); - - work_a = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q0 = _mm256_loadu_si256((__m256i *)flat_oq0); - work_a = _mm256_andnot_si256(flat, work_a); - q0 = _mm256_and_si256(flat, q0); - q0 = _mm256_or_si256(work_a, q0); - - work_a = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q1 = _mm256_loadu_si256((__m256i *)flat_oq1); - work_a = _mm256_andnot_si256(flat, work_a); - q1 = _mm256_and_si256(flat, q1); - q1 = _mm256_or_si256(work_a, q1); - - work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - q2 = _mm256_loadu_si256((__m256i *)flat_oq2); - work_a = _mm256_andnot_si256(flat, work_a); - q2 = _mm256_and_si256(flat, q2); - q2 = _mm256_or_si256(work_a, q2); - - work_a = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - p0 = _mm256_loadu_si256((__m256i *)flat_op0); - work_a = _mm256_andnot_si256(flat, work_a); - p0 = _mm256_and_si256(flat, p0); - p0 = _mm256_or_si256(work_a, p0); - - work_a = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - p1 = _mm256_loadu_si256((__m256i *)flat_op1); - work_a = _mm256_andnot_si256(flat, work_a); - p1 = _mm256_and_si256(flat, p1); - p1 = _mm256_or_si256(work_a, p1); - - work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - p2 = _mm256_loadu_si256((__m256i *)flat_op2); - work_a = _mm256_andnot_si256(flat, work_a); - p2 = _mm256_and_si256(flat, p2); - p2 = _mm256_or_si256(work_a, p2); - - _mm256_storeu_si256((__m256i *)(s - 3 * p), p2); - _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); - _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); - _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); - _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); - _mm256_storeu_si256((__m256i *)(s + 2 * p), q2); -} - -void aom_highbd_lpf_vertical_4_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; - - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} - -void aom_highbd_lpf_vertical_8_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; - - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} -#endif // CONFIG_PARALLEL_DEBLOCKING diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c index e2f641b32..35f2c35ca 100644 --- a/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -502,14 +502,12 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, // Note: // highbd_lpf_horz_edge_8_8p() output 8 pixels per register // highbd_lpf_horz_edge_8_4p() output 4 pixels per register -#if CONFIG_PARALLEL_DEBLOCKING static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, const uint8_t *blt, const uint8_t *lt, const uint8_t *thr, int bd) { highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); } -#endif // #if CONFIG_PARALLEL_DEBLOCKING static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, const uint8_t *blt, @@ -522,44 +520,26 @@ void aom_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); -#endif } void aom_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); - highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); -#endif } static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, const __m128i *p0, const __m128i *q0, const __m128i *q1, const __m128i *q2, int p, uint16_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); -#else - _mm_store_si128((__m128i *)(s - 3 * p), *p2); - _mm_store_si128((__m128i *)(s - 2 * p), *p1); - _mm_store_si128((__m128i *)(s - 1 * p), *p0); - _mm_store_si128((__m128i *)(s + 0 * p), *q0); - _mm_store_si128((__m128i *)(s + 1 * p), *q1); - _mm_store_si128((__m128i *)(s + 2 * p), *q2); -#endif } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -797,11 +777,7 @@ void aom_highbd_lpf_horizontal_8_dual_sse2( const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_horizontal_8_sse2(s + 4, p, _blimit1, _limit1, _thresh1, bd); -#else - aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); -#endif } void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, @@ -811,18 +787,10 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; -#if !(CONFIG_PARALLEL_DEBLOCKING) - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !(CONFIG_PARALLEL_DEBLOCKING) - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -905,16 +873,6 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); mask = _mm_max_epi16(flat, mask); -#if !(CONFIG_PARALLEL_DEBLOCKING) - __m128i work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); -#endif mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); @@ -980,17 +938,10 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, p1 = _mm_adds_epi16(ps1, filt); pixel_clamp(&pmin, &pmax, &p1); p1 = _mm_adds_epi16(p1, t80); -#if CONFIG_PARALLEL_DEBLOCKING _mm_storel_epi64((__m128i *)(s - 2 * p), p1); _mm_storel_epi64((__m128i *)(s - 1 * p), p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q0); _mm_storel_epi64((__m128i *)(s + 1 * p), q1); -#else - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); -#endif } void aom_highbd_lpf_horizontal_4_dual_sse2( @@ -998,11 +949,7 @@ void aom_highbd_lpf_horizontal_4_dual_sse2( const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); -#if CONFIG_PARALLEL_DEBLOCKING aom_highbd_lpf_horizontal_4_sse2(s + 4, p, _blimit1, _limit1, _thresh1, bd); -#else - aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); -#endif } void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, @@ -1134,13 +1081,7 @@ void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, // Transpose 16x16 highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - -#if CONFIG_PARALLEL_DEBLOCKING highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); -#else - aom_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); -#endif // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c index 397011d76..64f7756b3 100644 --- a/aom_dsp/x86/loopfilter_sse2.c +++ b/aom_dsp/x86/loopfilter_sse2.c @@ -20,7 +20,6 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -#if CONFIG_PARALLEL_DEBLOCKING // filter_mask and hev_mask #define FILTER_HEV_MASK4 \ do { \ @@ -52,7 +51,6 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { mask = _mm_cmpeq_epi8(mask, zero); \ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ } while (0) -#endif // CONFIG_PARALLEL_DEBLOCKING // filter_mask and hev_mask #define FILTER_HEV_MASK \ @@ -151,47 +149,21 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, _mm_loadl_epi64((const __m128i *)_limit)); const __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), _mm_loadl_epi64((__m128i *)(s + 1 * p))); q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), _mm_loadl_epi64((__m128i *)(s + 0 * p))); -#if !CONFIG_PARALLEL_DEBLOCKING - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); -#if CONFIG_PARALLEL_DEBLOCKING xx_storel_32(s - 1 * p, ps1ps0); xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); xx_storel_32(s + 0 * p, qs1qs0); xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); -#else - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -#endif } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -205,9 +177,6 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); __m128i x0, x1, x2, x3; -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; @@ -232,40 +201,21 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, p1p0 = _mm_unpacklo_epi16(q1q0, x1); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 x0 = _mm_unpacklo_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); -#endif // !CONFIG_PARALLEL_DEBLOCKING // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 p1p0 = _mm_unpackhi_epi32(p1p0, x0); -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high -#endif // !CONFIG_PARALLEL_DEBLOCKING p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 q1q0 = _mm_unpackhi_epi16(q1q0, x1); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 x2 = _mm_unpackhi_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); -#endif // !CONFIG_PARALLEL_DEBLOCKING // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 q1q0 = _mm_unpacklo_epi32(q1q0, x2); q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); // Transpose 8x4 to 4x8 @@ -277,10 +227,6 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); -#if !CONFIG_PARALLEL_DEBLOCKING - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); -#endif // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); @@ -288,22 +234,11 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, xx_storel_32(s + 1 * p - 2, _mm_srli_si128(ps1ps0, 4)); xx_storel_32(s + 2 * p - 2, _mm_srli_si128(ps1ps0, 8)); xx_storel_32(s + 3 * p - 2, _mm_srli_si128(ps1ps0, 12)); -#if !CONFIG_PARALLEL_DEBLOCKING - xx_storel_32(s + 4 * p - 2, qs1qs0); - xx_storel_32(s + 5 * p - 2, _mm_srli_si128(qs1qs0, 4)); - xx_storel_32(s + 6 * p - 2, _mm_srli_si128(qs1qs0, 8)); - xx_storel_32(s + 7 * p - 2, _mm_srli_si128(qs1qs0, 12)); -#endif } static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING xx_storel_32(s - (num + 1) * p, x); xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); -#else - xx_storel_64(s - (num + 1) * p, x); - _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(x)); -#endif } void aom_lpf_horizontal_16_sse2(unsigned char *s, int p, @@ -1336,17 +1271,10 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, p1p0 = _mm_and_si128(flat, flat_p1p0); p1p0 = _mm_or_si128(ps1ps0, p1p0); -#if CONFIG_PARALLEL_DEBLOCKING xx_storel_32(s - 1 * p, p1p0); xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); xx_storel_32(s + 0 * p, q1q0); xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); -#else - xx_storel_64(s - 1 * p, p1p0); - xx_storel_64(s - 2 * p, _mm_srli_si128(p1p0, 8)); - xx_storel_64(s + 0 * p, q1q0); - xx_storel_64(s + 1 * p, _mm_srli_si128(q1q0, 8)); -#endif } } @@ -1526,32 +1454,19 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, p2 = _mm_and_si128(flat, op2); p2 = _mm_or_si128(work_a, p2); -#if CONFIG_PARALLEL_DEBLOCKING xx_storel_32(s - 1 * p, p1p0); xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); xx_storel_32(s + 0 * p, q1q0); xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); xx_storel_32(s - 3 * p, p2); xx_storel_32(s + 2 * p, q2); -#else - xx_storel_64(s - 1 * p, p1p0); - xx_storel_64(s - 2 * p, _mm_srli_si128(p1p0, 8)); - xx_storel_64(s + 0 * p, q1q0); - xx_storel_64(s + 1 * p, _mm_srli_si128(q1q0, 8)); - xx_storel_64(s - 3 * p, p2); - xx_storel_64(s + 2 * p, q2); -#endif } void aom_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { -#if CONFIG_PARALLEL_DEBLOCKING lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); -#else - lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); -#endif } void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, @@ -1813,23 +1728,12 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), _mm_load_si128((const __m128i *)_thresh1)); const __m128i zero = _mm_set1_epi16(0); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3, p2, q2, q3; -#endif // !CONFIG_PARALLEL_DEBLOCKING __m128i p1, p0, q0, q1; __m128i mask, hev, flat; -#if !CONFIG_PARALLEL_DEBLOCKING - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !CONFIG_PARALLEL_DEBLOCKING - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING // filter_mask and hev_mask { const __m128i abs_p1p0 = @@ -1842,9 +1746,6 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i work; -#endif // !CONFIG_PARALLEL_DEBLOCKING flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -1855,18 +1756,6 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); -#if !CONFIG_PARALLEL_DEBLOCKING - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); -#endif // !CONFIG_PARALLEL_DEBLOCKING mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } @@ -2010,7 +1899,6 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); } -#if CONFIG_PARALLEL_DEBLOCKING #define movq(p) _mm_loadl_epi64((const __m128i *)(p)) #define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1) #define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1) @@ -2053,8 +1941,6 @@ static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride, } } -#endif // CONFIG_PARALLEL_DEBLOCKING - static INLINE void transpose6x6(unsigned char *src[], int in_p, unsigned char *dst[], int out_p, int num_6x6_to_transpose) { @@ -2195,27 +2081,13 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); -#if !CONFIG_PARALLEL_DEBLOCKING - unsigned char *src[2]; - unsigned char *dst[2]; -#endif // !CONFIG_PARALLEL_DEBLOCKING // Transpose 8x16 transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); // Loop filtering aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, blimit1, limit1, thresh1); -#if !CONFIG_PARALLEL_DEBLOCKING - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -#else // CONFIG_PARALLEL_DEBLOCKING transpose16x4(s - 2, p, t_dst + 16 * 2, 16); -#endif // !CONFIG_PARALLEL_DEBLOCKING } void aom_lpf_vertical_6_sse2(unsigned char *s, int p, diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index fd527db6a..398d70762 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c @@ -1856,7 +1856,6 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm, dst->buf = dst0; } -#if CONFIG_PARALLEL_DEBLOCKING typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES_ALL] = { // mask for vertical edges filtering @@ -2342,7 +2341,6 @@ static void av1_filter_block_plane_horz( } } } -#endif // CONFIG_PARALLEL_DEBLOCKING void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, struct macroblockd_plane *planes, int start, int stop, @@ -2360,35 +2358,11 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, const int plane_start = 0; const int plane_end = nplanes; #endif // CONFIG_LOOPFILTER_LEVEL -#if CONFIG_PARALLEL_DEBLOCKING const int col_start = 0; const int col_end = cm->mi_cols; -#endif int mi_row, mi_col; int plane; -#if !CONFIG_PARALLEL_DEBLOCKING - for (int i = 0; i < nplanes; ++i) - memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2); - for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { - MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; - for (int i = 0; i < nplanes; ++i) - memset(cm->left_txfm_context[i], TX_32X32, - MAX_MIB_SIZE << TX_UNIT_HIGH_LOG2); - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->seq_params.mib_size) { - av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col); - - for (plane = plane_start; plane < plane_end; ++plane) { - av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - } - } - } -#else - // filter all vertical edges in every 64x64 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { @@ -2410,7 +2384,6 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } } } -#endif // !CONFIG_PARALLEL_DEBLOCKING } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 25e588ea7..46d29cba1 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c @@ -142,7 +142,6 @@ static INLINE void loop_filter_block_plane_hor( } #endif // Row-based multi-threaded loopfilter hook -#if CONFIG_PARALLEL_DEBLOCKING static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync, LFWorkerData *const lf_data) { const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; @@ -226,66 +225,6 @@ static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync, } return 1; } -#else // CONFIG_PARALLEL_DEBLOCKING -static int loop_filter_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { - const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; - const int sb_cols = mi_cols_aligned_to_sb(lf_data->cm) >> - lf_data->cm->seq_params.mib_size_log2; - int mi_row, mi_col; -#if !CONFIG_EXT_PARTITION_TYPES - enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); -#endif // !CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_EXT_PARTITION - printf( - "STOPPING: This code has not been modified to work with the " - "extended coding unit size experiment"); - exit(EXIT_FAILURE); -#endif // CONFIG_EXT_PARTITION - - for (mi_row = lf_data->start; mi_row < lf_data->stop; - mi_row += lf_sync->num_workers * lf_data->cm->seq_params.mib_size) { - MODE_INFO **const mi = - lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; - - for (mi_col = 0; mi_col < lf_data->cm->mi_cols; - mi_col += lf_data->cm->seq_params.mib_size) { - const int r = mi_row >> lf_data->cm->seq_params.mib_size_log2; - const int c = mi_col >> lf_data->cm->seq_params.mib_size_log2; -#if !CONFIG_EXT_PARTITION_TYPES - LOOP_FILTER_MASK lfm; -#endif - int plane; - - sync_read(lf_sync, r, c); - - av1_setup_dst_planes(lf_data->planes, lf_data->cm->seq_params.sb_size, - lf_data->frame_buffer, mi_row, mi_col); -#if CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) { - av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); - av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); - } -#else - av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, - lf_data->cm->mi_stride, &lfm); - - for (plane = 0; plane < num_planes; ++plane) { - loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); - loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); - } -#endif // CONFIG_EXT_PARTITION_TYPES - sync_write(lf_sync, r, c, sb_cols); - } - } - return 1; -} -#endif // CONFIG_PARALLEL_DEBLOCKING static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, struct macroblockd_plane *planes, int start, @@ -313,16 +252,15 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } -// Set up loopfilter thread data. -// The decoder is capping num_workers because it has been observed that -// using more threads on the loopfilter than there are cores will hurt -// performance on Android. This is because the system will only schedule the -// tile decode workers on cores equal to the number of tile columns. Then if -// the decoder tries to use more threads for the loopfilter, it will hurt -// performance because of contention. If the multithreading code changes in -// the future then the number of workers used by the loopfilter should be -// revisited. -#if CONFIG_PARALLEL_DEBLOCKING + // Set up loopfilter thread data. + // The decoder is capping num_workers because it has been observed that + // using more threads on the loopfilter than there are cores will hurt + // performance on Android. This is because the system will only schedule the + // tile decode workers on cores equal to the number of tile columns. Then if + // the decoder tries to use more threads for the loopfilter, it will hurt + // performance because of contention. If the multithreading code changes in + // the future then the number of workers used by the loopfilter should be + // revisited. // Initialize cur_sb_col to -1 for all SB rows. memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -382,37 +320,6 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, for (i = 0; i < num_workers; ++i) { winterface->sync(&workers[i]); } -#else // CONFIG_PARALLEL_DEBLOCKING - // Initialize cur_sb_col to -1 for all SB rows. - memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); - - for (i = 0; i < num_workers; ++i) { - AVxWorker *const worker = &workers[i]; - LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - - worker->hook = (AVxWorkerHook)loop_filter_row_worker; - worker->data1 = lf_sync; - worker->data2 = lf_data; - - // Loopfilter data - av1_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = start + i * cm->seq_params.mib_size; - lf_data->stop = stop; - lf_data->y_only = y_only; - - // Start loopfiltering - if (i == num_workers - 1) { - winterface->execute(worker); - } else { - winterface->launch(worker); - } - } - - // Wait till all rows are finished - for (i = 0; i < num_workers; ++i) { - winterface->sync(&workers[i]); - } -#endif // CONFIG_PARALLEL_DEBLOCKING } void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 13a02a33e..a69a35567 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -142,7 +142,6 @@ set(CONFIG_OBU_NO_IVF 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_OBU_SIZING 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_OPT_REF_MV 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_PALETTE_THROUGHPUT 1 CACHE NUMBER "AV1 experiment flag.") -set(CONFIG_PARALLEL_DEBLOCKING 1 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_Q_ADAPT_PROBS 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_RD_DEBUG 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_RECT_TX_EXT 1 CACHE NUMBER "AV1 experiment flag.") diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake index 38200f8ff..6106cdb8f 100644 --- a/build/cmake/aom_experiment_deps.cmake +++ b/build/cmake/aom_experiment_deps.cmake @@ -57,9 +57,6 @@ macro (fix_experiment_configs) if (NOT CONFIG_EXT_DELTA_Q) change_config_and_warn(CONFIG_EXT_DELTA_Q 1 CONFIG_LOOPFILTER_LEVEL) endif () - if (NOT CONFIG_PARALLEL_DEBLOCKING) - change_config_and_warn(CONFIG_PARALLEL_DEBLOCKING 1 CONFIG_LOOPFILTER_LEVEL) - endif () endif () if (CONFIG_TXK_SEL) diff --git a/test/lpf_test.cc b/test/lpf_test.cc index a2b7edf19..ca8fbfd61 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -548,17 +548,6 @@ INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param_hbd, #endif #endif -#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING) -INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test6Param, - ::testing::Values( -#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet - make_tuple(&aom_lpf_horizontal_16_dual_avx2, - &aom_lpf_horizontal_16_dual_c, 8), -#endif - make_tuple(&aom_lpf_horizontal_16_avx2, - &aom_lpf_horizontal_16_c, 8))); -#endif - #if HAVE_SSE2 const hbddual_loop_param_t kHbdLoop8Test9[] = { make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, @@ -590,17 +579,6 @@ const hbddual_loop_param_t kHbdLoop8Test9[] = { INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd, ::testing::ValuesIn(kHbdLoop8Test9)); -#if !CONFIG_PARALLEL_DEBLOCKING -const hbddual_loop_param_t kLoop8Test9[] = { - make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8), - make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8), - make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8), - make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8) -}; - -INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd, - ::testing::ValuesIn(kLoop8Test9)); -#endif #endif // HAVE_SSE2 #if HAVE_AVX2 -- GitLab