diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 73979d0672e8f71b6fc962d2d5cc3f9209a8adf5..27e985c8db960130277033a5273e1c1031f04d08 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -624,6 +624,15 @@ specialize qw/aom_lpf_horizontal_4 mmx neon dspr2 msa/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +if (aom_config("CONFIG_CLPF") eq "yes") { + add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; + specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; + add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; + specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; + add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum"; + specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/; +} + if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_16 sse2/; diff --git a/av1/av1_common.mk b/av1/av1_common.mk index 03d94f28bd889997b92c014d6ee7b34025df9de5..8702b6e222e21f21afe6e27facebecddd635a370 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk @@ -75,6 +75,11 @@ AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c ifeq ($(CONFIG_CLPF),yes) AV1_COMMON_SRCS-yes += common/clpf.c AV1_COMMON_SRCS-yes += common/clpf.h +AV1_COMMON_SRCS-yes += 
common/clpf_simd.h +AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c +AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c +AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4_1.c +AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c endif ifeq ($(CONFIG_DERING),yes) AV1_COMMON_SRCS-yes += common/od_dering.c diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk index f8f37e8b72b863c2f8b43b43b931c6ac61273626..67e40213f94fa3eb149d9cde467c091e46c334bf 100644 --- a/av1/av1_cx.mk +++ b/av1/av1_cx.mk @@ -89,6 +89,11 @@ AV1_CX_SRCS-$(CONFIG_DERING) += encoder/pickdering.c ifeq ($(CONFIG_CLPF),yes) AV1_CX_SRCS-yes += encoder/clpf_rdo.c AV1_CX_SRCS-yes += encoder/clpf_rdo.h +AV1_CX_SRCS-yes += encoder/clpf_rdo_simd.h +AV1_CX_SRCS-$(HAVE_SSE2) += encoder/clpf_rdo_sse2.c +AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/clpf_rdo_ssse3.c +AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/clpf_rdo_sse4_1.c +AV1_CX_SRCS-$(HAVE_NEON) += encoder/clpf_rdo_neon.c endif AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 861dde6dcf180ed523a609f18adaa284d0137d5e..388a7c93af2b4ea64b1664dc175c840bb04e9874 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "av1/common/clpf.h" +#include "./aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" int av1_clpf_maxbits(const AV1_COMMON *cm) { @@ -27,9 +28,9 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { return (8 + delta - (delta < 0)) >> 4; } -static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizex, int sizey, int width, int height, - unsigned int strength) { +void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0, + int y0, int sizex, int sizey, int width, int height, + unsigned int strength) { int x, y; for (y = y0; y < y0 + sizey; y++) { for (x = x0; x < x0 + sizex; x++) { @@ -102,8 +103,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] ->mbmi.skip) { // Not skip block, apply the filter - clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, bs, - bs, width, height, strength); + aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, + bs, bs, width, height, strength); } else { // Skip block, copy instead for (c = 0; c < bs; c++) *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = diff --git a/av1/common/clpf_neon.c b/av1/common/clpf_neon.c new file mode 100644 index 0000000000000000000000000000000000000000..f1a004c2c6905ebb67823375fb962de34028d37b --- /dev/null +++ b/av1/common/clpf_neon.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_neon +#include "./clpf_simd.h" diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h new file mode 100644 index 0000000000000000000000000000000000000000..faaf8ea9fe28cd23985e01415e75e9e8a7fa0481 --- /dev/null +++ b/av1/common/clpf_simd.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" + /* Filters the 8-pixel-wide, sizey-row block whose top-left pixel is (x0, y0), reading src and writing dst, two rows per loop iteration (so an odd sizey still processes a full final pair). Neighbors that would fall outside the frame are replaced by the nearest pixel inside it: shuffle masks do this at the left or right edge, and the row offset is zeroed at the top and bottom edge. Only one of the left/right cases is applied per call. */ +static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, + int y0, int sizey, int width, int height, + unsigned int strength) { + dst += x0 + y0 * stride; + src += x0 + y0 * stride; + { + int bottom = height - 2 - y0; /* y == bottom: the row pair ends on the last frame row */ + const v128 sp = v128_dup_8(strength); + const v128 sm = v128_dup_8(-(int)strength); + const v128 c8 = v128_dup_8(8); + const v128 c128 = v128_dup_8(128); /* adding 128 maps unsigned pixels onto the signed 8-bit range */ + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; + + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + stride); + v128 o = v128_from_v64(l1, l2); /* row y in the high half, row y + 1 in the low half (see the stores below) */ + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); /* row above each row; the top frame row reuses itself */ + const v128 b = v128_shuffle_8(x, b_shuff); + const v128 c = v128_shuffle_8(x, c_shuff); /* b, c: x shifted right by two and one pixels within each row, repeating pixel 0 (assuming lane 0 is the lowest mask byte) */ + const v128 d = v128_add_8( + c128,
v128_from_v64(v64_load_unaligned(src + 1), + v64_load_unaligned(src + 1 + stride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 2), + v64_load_unaligned(src + 2 + stride))); + const v128 f = v128_add_8( + c128, v128_from_v64(l2, v64_load_aligned( + src + ((y != bottom) + 1) * stride))); /* row below each row; the last frame row reuses itself */ + /* delta = 4 * (A + F) + (B + E) + 3 * (C + D), each capital being that neighbor minus x, clamped to [-strength, strength] */ + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + const v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + o = v128_add_8( + o, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( + delta, v128_zero()))), + 4)); /* o += (8 + delta - (delta < 0)) >> 4, assuming v128_cmplt_s8 yields -1 where delta < 0 */ + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + stride, v128_low_v64(o)); + src += stride * 2; + dst += stride * 2; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; + + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + stride); + v128 o = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 2), + v64_load_unaligned(src - 2 + stride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 1), + v64_load_unaligned(src - 1 + stride))); + const v128 d = v128_shuffle_8(x, d_shuff); + const v128 e = v128_shuffle_8(x,
e_shuff); /* d, e: x shifted left by one and two pixels within each row, repeating pixel 7 (assuming lane 0 is the lowest mask byte) */ + const v128 f = v128_add_8( + c128, v128_from_v64(l2, v64_load_aligned( + src + ((y != bottom) + 1) * stride))); + + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + const v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + o = v128_add_8( + o, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( + delta, v128_zero()))), + 4)); + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + stride, v128_low_v64(o)); + src += stride * 2; + dst += stride * 2; + } + } else { // No left/right clipping + int y; + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + stride); + v128 o = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 2), + v64_load_unaligned(src - 2 + stride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 1), + v64_load_unaligned(src - 1 + stride))); + const v128 d = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 1), + v64_load_unaligned(src + 1 + stride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 2), + v64_load_unaligned(src + 2 + stride))); + const v128 f = v128_add_8( + c128, v128_from_v64(l2, v64_load_aligned( + src + ((y != bottom) + 1) * stride))); + + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + const v128
delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + o = v128_add_8( + o, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( + delta, v128_zero()))), + 4)); + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + stride, v128_low_v64(o)); + src += stride * 2; + dst += stride * 2; + } + } + } +} + /* Entry point named through SIMD_FUNC by the including translation unit. Calls clpf_block() only when the block is 8 pixels wide and the frame is at least 16 pixels wide; otherwise it delegates to aom_clpf_block_c(). clpf_block() clips at the left or the right frame edge but never both in one call, which is the case a frame narrower than 16 pixels would need. */ +void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, + int x0, int y0, int sizex, int sizey, int width, + int height, unsigned int strength) { + // TODO(stemidts): + // A sizex different from 8 will only be needed if CLPF is extended to chroma. + // This will only be used if 4:2:0 and width not a multiple of 16 and along + // the right edge only, so we can fall back to the plain C implementation in + // this case. If not extended to chroma, this test will be redundant. + if (sizex != 8 || width < 16) { // Fallback to C if frame width < 16 + aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height, + strength); + } else { + clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength); + } +} diff --git a/av1/common/clpf_sse2.c b/av1/common/clpf_sse2.c new file mode 100644 index 0000000000000000000000000000000000000000..e29c2ab7eaf66099aabe4a1b07457868c032b349 --- /dev/null +++ b/av1/common/clpf_sse2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "./clpf_simd.h" diff --git a/av1/common/clpf_sse4_1.c b/av1/common/clpf_sse4_1.c new file mode 100644 index 0000000000000000000000000000000000000000..537139f17a0905bc1893d31f58f3950533b94d0b --- /dev/null +++ b/av1/common/clpf_sse4_1.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "./clpf_simd.h" diff --git a/av1/common/clpf_ssse3.c b/av1/common/clpf_ssse3.c new file mode 100644 index 0000000000000000000000000000000000000000..d7ed8dec5dbe48deca9bad3b79b05d9ae1fb077b --- /dev/null +++ b/av1/common/clpf_ssse3.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "./clpf_simd.h" diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c index 7710de41a7f3e0d4298b250fb8ceeabe0d56c855..8639add13723ec616d43f71763d3225ccc2334e5 100644 --- a/av1/encoder/clpf_rdo.c +++ b/av1/encoder/clpf_rdo.c @@ -10,24 +10,25 @@ */ #include "av1/common/clpf.h" +#include "./aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "av1/common/quant_common.h" // Calculate the error of a filtered and unfiltered block -static void detect_clpf(const uint8_t *rec, const uint8_t *org, int x0, int y0, - int width, int height, int so, int stride, int *sum0, - int *sum1, unsigned int strength) { +void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride, + int ostride, int x0, int y0, int width, int height, + int *sum0, int *sum1, unsigned int strength) { int x, y; for (y = y0; y < y0 + 8; y++) { for (x = x0; x < x0 + 8; x++) { - int O = org[y * so + x]; - int X = rec[y * stride + x]; - int A = rec[AOMMAX(0, y - 1) * stride + x]; - int B = rec[y * stride + AOMMAX(0, x - 2)]; - int C = rec[y * stride + AOMMAX(0, x - 1)]; - int D = rec[y * stride + AOMMIN(width - 1, x + 1)]; - int E = rec[y * stride + AOMMIN(width - 1, x + 2)]; - int F = rec[AOMMIN(height - 1, y + 1) * stride + x]; + int O = org[y * ostride + x]; + int X = rec[y * rstride + x]; + int A = rec[AOMMAX(0, y - 1) * rstride + x]; + int B = rec[y * rstride + AOMMAX(0, x - 2)]; + int C = rec[y * rstride + AOMMAX(0, x - 1)]; + int D = rec[y * rstride + AOMMIN(width - 1, x + 1)]; + int E = rec[y * rstride + AOMMIN(width - 1, x + 2)]; + int F = rec[AOMMIN(height - 1, y + 1) * rstride + x]; int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); int Y = X + delta; *sum0 += (O - X) * (O - X); @@ -36,21 +37,21 @@ static void detect_clpf(const uint8_t *rec, const uint8_t *org, int x0, int y0, } } -static void detect_multi_clpf(const uint8_t *rec, const uint8_t *org, int x0, - int y0, int width, int 
height, int so, int stride, - int *sum) { +void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, + int rstride, int ostride, int x0, int y0, + int width, int height, int *sum) { int x, y; for (y = y0; y < y0 + 8; y++) { for (x = x0; x < x0 + 8; x++) { - int O = org[y * so + x]; - int X = rec[y * stride + x]; - int A = rec[AOMMAX(0, y - 1) * stride + x]; - int B = rec[y * stride + AOMMAX(0, x - 2)]; - int C = rec[y * stride + AOMMAX(0, x - 1)]; - int D = rec[y * stride + AOMMIN(width - 1, x + 1)]; - int E = rec[y * stride + AOMMIN(width - 1, x + 2)]; - int F = rec[AOMMIN(height - 1, y + 1) * stride + x]; + int O = org[y * ostride + x]; + int X = rec[y * rstride + x]; + int A = rec[AOMMAX(0, y - 1) * rstride + x]; + int B = rec[y * rstride + AOMMAX(0, x - 2)]; + int C = rec[y * rstride + AOMMAX(0, x - 1)]; + int D = rec[y * rstride + AOMMIN(width - 1, x + 1)]; + int E = rec[y * rstride + AOMMIN(width - 1, x + 2)]; + int F = rec[AOMMIN(height - 1, y + 1) * rstride + x]; int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1); int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2); int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4); @@ -77,9 +78,9 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const int bs = MAX_MIB_SIZE; if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] ->mbmi.skip) - detect_clpf(rec->y_buffer, org->y_buffer, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, org->y_stride, rec->y_stride, &sum0, - &sum1, strength); + aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, + org->y_stride, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, &sum0, &sum1, strength); } } *res = sum1 < sum0; @@ -144,9 +145,9 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride + xpos / MAX_MIB_SIZE] ->mbmi.skip) { - detect_multi_clpf(rec->y_buffer, org->y_buffer, xpos, ypos, - rec->y_crop_width, rec->y_crop_height, org->y_stride, - 
rec->y_stride, sum); + aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, + org->y_stride, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, sum); filtered = 1; } } diff --git a/av1/encoder/clpf_rdo_neon.c b/av1/encoder/clpf_rdo_neon.c new file mode 100644 index 0000000000000000000000000000000000000000..02053c518069f40f8c93571dbbeda7cc4b681609 --- /dev/null +++ b/av1/encoder/clpf_rdo_neon.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_neon +#include "./clpf_rdo_simd.h" diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h new file mode 100644 index 0000000000000000000000000000000000000000..abbbe7c071c5d993d572f977bcd58dc687ab5e58 --- /dev/null +++ b/av1/encoder/clpf_rdo_simd.h @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_simd.h" + +/* Adds to *sum0 the squared error between org and the unfiltered rec, and to + * *sum1 the squared error between org and rec filtered with the given + * strength, for the 8x8 block whose top-left pixel is (x0, y0). */ +void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, + int rstride, int ostride, int x0, int y0, + int width, int height, int *sum0, int *sum1, + unsigned int strength) { + ssd128_internal ssd0 = v128_ssd_u8_init(); + ssd128_internal ssd1 = v128_ssd_u8_init(); + const v128 c128 = v128_dup_8(128); /* signed-range bias for pixels */ + const v128 sp = v128_dup_8(strength); + const v128 sm = v128_dup_8(-(int)strength); + const int bottom = height - 2 - y0; /* row pair ending on the last row */ + + /* The vector paths below clip against the left or the right frame edge but + * not both, so frames narrower than 16 pixels use the C version, matching + * the fallback in the SIMD aom_clpf_block(). */ + if (width < 16) { + aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0, + sum1, strength); + return; + } + + rec += x0 + y0 * rstride; + org += x0 + y0 * ostride; + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + const v128 b = v128_shuffle_8(x, b_shuff); + const v128 c = v128_shuffle_8(x, c_shuff); + const v128 d = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec + 1), + v64_load_unaligned(rec + 1 + rstride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec + 2), + v64_load_unaligned(rec + 2 + rstride))); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp),
sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + delta = v128_shr_s8( + v128_add_8(v128_dup_8(8), + v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), + 4); + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + rec += rstride * 2; + org += ostride * 2; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec - 2), + v64_load_unaligned(rec - 2 + rstride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec - 1), + v64_load_unaligned(rec - 1 + rstride))); + const v128 d = v128_shuffle_8(x, d_shuff); + const v128 e = v128_shuffle_8(x, e_shuff); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), +
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + delta = v128_shr_s8( + v128_add_8(v128_dup_8(8), + v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), + 4); + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + rec += rstride * 2; + org += ostride * 2; + } + } else { // No left/right clipping + int y; + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec - 2), + v64_load_unaligned(rec - 2 + rstride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec - 1), + v64_load_unaligned(rec - 1 + rstride))); + const v128 d = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec + 1), + v64_load_unaligned(rec + 1 + rstride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(rec + 2), + v64_load_unaligned(rec + 2 + rstride))); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + delta = v128_shr_s8( + v128_add_8(v128_dup_8(8), + v128_add_8(delta,
v128_cmplt_s8(delta, v128_zero()))), + 4); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + rec += rstride * 2; + org += ostride * 2; + } + } + *sum0 += v128_ssd_u8_sum(ssd0); + *sum1 += v128_ssd_u8_sum(ssd1); +} + +// Test multiple filter strengths (1, 2 and 4) at once, using the same filter +// taps and rows as aom_clpf_detect. +void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, + int rstride, int ostride, int x0, int y0, + int width, int height, int *sum) { + const v128 c128 = v128_dup_8(128); + const v128 cp1 = v128_dup_8(1); + const v128 cm1 = v128_dup_8(-1); + const v128 cp2 = v128_dup_8(2); + const v128 cm2 = v128_dup_8(-2); + const v128 cp4 = v128_dup_8(4); + const v128 cm4 = v128_dup_8(-4); + const v128 c8 = v128_dup_8(8); + const int bottom = height - 2 - y0; + ssd128_internal ssd0 = v128_ssd_u8_init(); + ssd128_internal ssd1 = v128_ssd_u8_init(); + ssd128_internal ssd2 = v128_ssd_u8_init(); + ssd128_internal ssd3 = v128_ssd_u8_init(); + + rec += x0 + y0 * rstride; + org += x0 + y0 * ostride; + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + v128 b = v128_shuffle_8(x, b_shuff); + v128 c = v128_shuffle_8(x, c_shuff); + v128 d = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec + 1), + v64_load_unaligned(rec + 1 + rstride))); + v128 e = v128_add_8(c128, +
v128_from_v64(v64_load_unaligned(rec + 2), + v64_load_unaligned(rec + 2 + rstride))); + v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + v128 tmp, delta1, delta2, delta3; + + a = v128_ssub_s8(a, x); + b = v128_ssub_s8(b, x); + c = v128_ssub_s8(c, x); + d = v128_ssub_s8(d, x); + e = v128_ssub_s8(e, x); + f = v128_ssub_s8(f, x); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), + v128_max_s8(v128_min_s8(d, cp1), cm1)); + delta1 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), + v128_max_s8(v128_min_s8(f, cp1), cm1)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), + v128_max_s8(v128_min_s8(e, cp1), cm1))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), + v128_max_s8(v128_min_s8(d, cp2), cm2)); + delta2 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), + v128_max_s8(v128_min_s8(f, cp2), cm2)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), + v128_max_s8(v128_min_s8(e, cp2), cm2))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), + v128_max_s8(v128_min_s8(d, cp4), cm4)); + delta3 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), + v128_max_s8(v128_min_s8(f, cp4), cm4)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), + v128_max_s8(v128_min_s8(e, cp4), cm4))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( + delta1, v128_zero()))), + 4))); + ssd2 = v128_ssd_u8( + ssd2, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( + delta2, v128_zero()))), + 4))); + ssd3 = v128_ssd_u8( + ssd3, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta3, 
v128_cmplt_s8( + delta3, v128_zero()))), + 4))); + rec += 2 * rstride; + org += 2 * ostride; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + v128 b = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec - 2), + v64_load_unaligned(rec - 2 + rstride))); + v128 c = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec - 1), + v64_load_unaligned(rec - 1 + rstride))); + v128 d = v128_shuffle_8(x, d_shuff); + v128 e = v128_shuffle_8(x, e_shuff); + v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + v128 tmp, delta1, delta2, delta3; + + a = v128_ssub_s8(a, x); + b = v128_ssub_s8(b, x); + c = v128_ssub_s8(c, x); + d = v128_ssub_s8(d, x); + e = v128_ssub_s8(e, x); + f = v128_ssub_s8(f, x); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), + v128_max_s8(v128_min_s8(d, cp1), cm1)); + delta1 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), + v128_max_s8(v128_min_s8(f, cp1), cm1)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), + v128_max_s8(v128_min_s8(e, cp1), cm1))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), + v128_max_s8(v128_min_s8(d, cp2), cm2)); + delta2 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), + 
v128_max_s8(v128_min_s8(f, cp2), cm2)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), + v128_max_s8(v128_min_s8(e, cp2), cm2))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), + v128_max_s8(v128_min_s8(d, cp4), cm4)); + delta3 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), + v128_max_s8(v128_min_s8(f, cp4), cm4)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), + v128_max_s8(v128_min_s8(e, cp4), cm4))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( + delta1, v128_zero()))), + 4))); + ssd2 = v128_ssd_u8( + ssd2, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( + delta2, v128_zero()))), + 4))); + ssd3 = v128_ssd_u8( + ssd3, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8( + delta3, v128_zero()))), + 4))); + rec += 2 * rstride; + org += 2 * ostride; + } + } else { // No left/right clipping + int y; + for (y = 0; y < 8; y += 2) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + v128 o = v128_from_v64(k1, k2); + const v128 q = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, q); + v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); + v128 b = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec - 2), + v64_load_unaligned(rec - 2 + rstride))); + v128 c = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec - 1), + v64_load_unaligned(rec - 1 + rstride))); + v128 d = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec + 1), + v64_load_unaligned(rec + 1 + rstride))); + v128 e = v128_add_8(c128, + v128_from_v64(v64_load_unaligned(rec + 2), + 
v64_load_unaligned(rec + 2 + rstride))); + v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); + v128 tmp, delta1, delta2, delta3; + + a = v128_ssub_s8(a, x); + b = v128_ssub_s8(b, x); + c = v128_ssub_s8(c, x); + d = v128_ssub_s8(d, x); + e = v128_ssub_s8(e, x); + f = v128_ssub_s8(f, x); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), + v128_max_s8(v128_min_s8(d, cp1), cm1)); + delta1 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), + v128_max_s8(v128_min_s8(f, cp1), cm1)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), + v128_max_s8(v128_min_s8(e, cp1), cm1))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), + v128_max_s8(v128_min_s8(d, cp2), cm2)); + delta2 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), + v128_max_s8(v128_min_s8(f, cp2), cm2)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), + v128_max_s8(v128_min_s8(e, cp2), cm2))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), + v128_max_s8(v128_min_s8(d, cp4), cm4)); + delta3 = v128_add_8( + v128_add_8( + v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), + v128_max_s8(v128_min_s8(f, cp4), cm4)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), + v128_max_s8(v128_min_s8(e, cp4), cm4))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( + delta1, v128_zero()))), + 4))); + ssd2 = v128_ssd_u8( + ssd2, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( + delta2, v128_zero()))), + 4))); + ssd3 = v128_ssd_u8( + ssd3, o, + v128_add_8( + q, + v128_shr_s8( + v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8( + delta3, v128_zero()))), + 4))); + rec += 
2 * rstride; + org += 2 * ostride; + } + } + sum[0] += v128_ssd_u8_sum(ssd0); + sum[1] += v128_ssd_u8_sum(ssd1); + sum[2] += v128_ssd_u8_sum(ssd2); + sum[3] += v128_ssd_u8_sum(ssd3); +} diff --git a/av1/encoder/clpf_rdo_sse2.c b/av1/encoder/clpf_rdo_sse2.c new file mode 100644 index 0000000000000000000000000000000000000000..99847c01a7efc6d71cffd64758ca44b423925121 --- /dev/null +++ b/av1/encoder/clpf_rdo_sse2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "./clpf_rdo_simd.h" diff --git a/av1/encoder/clpf_rdo_sse4_1.c b/av1/encoder/clpf_rdo_sse4_1.c new file mode 100644 index 0000000000000000000000000000000000000000..049f5371cc1f8711c9275464ade2b3d819fcd395 --- /dev/null +++ b/av1/encoder/clpf_rdo_sse4_1.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "./clpf_rdo_simd.h" diff --git a/av1/encoder/clpf_rdo_ssse3.c b/av1/encoder/clpf_rdo_ssse3.c new file mode 100644 index 0000000000000000000000000000000000000000..35b23b2d2791f350a9ddbea414d9fca4ec87b79e --- /dev/null +++ b/av1/encoder/clpf_rdo_ssse3.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "./clpf_rdo_simd.h" diff --git a/test/clpf_test.cc b/test/clpf_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..786180b6a9e9e5fa536852a8e9343e94c5dc2d57 --- /dev/null +++ b/test/clpf_test.cc @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+*/
+
+#include <cstring>
+#include <iostream>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
+                             int x0, int y0, int sizex, int sizey, int width,
+                             int height, unsigned int strength);
+
+typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
+    clpf_block_param_t;
+
+// Fixture parameterized on (function under test, reference function, block
+// width, block height); SetUp() unpacks the tuple into the members below.
+class ClpfBlockTest : public ::testing::TestWithParam<clpf_block_param_t> {
+ public:
+  virtual ~ClpfBlockTest() {}
+  virtual void SetUp() {
+    clpf = GET_PARAM(0);
+    ref_clpf = GET_PARAM(1);
+    sizex = GET_PARAM(2);
+    sizey = GET_PARAM(3);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int sizex;
+  int sizey;
+  clpf_block_t clpf;
+  clpf_block_t ref_clpf;
+};
+
+typedef ClpfBlockTest ClpfSpeedTest;
+
+// Checks that `clpf` writes exactly the same bytes as `ref_clpf`.
+TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
+  int w = sizex;
+  int h = sizey;
+  const int size = 32;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t, s[size * size]);
+  DECLARE_ALIGNED(16, uint8_t, d[size * size]);
+  DECLARE_ALIGNED(16, uint8_t, ref_d[size * size]);
+  memset(ref_d, 0, size * size);
+  memset(d, 0, size * size);
+
+  int error = 0;
+  int pos = 0;
+  int strength = 0;
+  int xpos = 0, ypos = 0;
+
+  // Test every combination of:
+  // * Input with 1-8 bits of noise
+  // * Noise level around every value from 0 to 255
+  // * Blocks anywhere in the frame (along all edges and also fully inside)
+  // * All strengths
+  for (int level = 0; level < 256 && !error; level++) {
+    for (int bits = 1; bits < 9 && !error; bits++) {
+      for (int i = 0; i < size * size; i++)
+        s[i] = clamp((rnd.Rand8() & ((1 << bits) - 1)) + level, 0, 255);
+
+      for (ypos = 0; ypos < size && !error; ypos += h * !error) {
+        for (xpos = 0; xpos < size && !error; xpos += w * !error) {
+          for (strength = 0; strength < 3 && !error; strength += !error) {
+            ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
+                     1 << strength);
+            ASM_REGISTER_STATE_CHECK(
+                clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
+
+            // Advancing by !error freezes pos on the first mismatching byte.
+            for (pos = 0; pos < size * size && !error; pos += !error) {
+              error = ref_d[pos] != d[pos];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, error)
+      << "Error: ClpfBlockTest, SIMD and C mismatch." << std::endl
+      << "First error at " << pos % size << "," << pos / size << " ("
+      << (int16_t)ref_d[pos] << " != " << (int16_t)d[pos] << ") " << std::endl
+      << "strength: " << (1 << strength) << std::endl
+      << "xpos: " << xpos << std::endl
+      << "ypos: " << ypos << std::endl
+      << "A=" << (pos >= size ? (int16_t)s[pos - size] : -1) << std::endl
+      << "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
+      << "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
+      << "X=" << (int16_t)s[pos] << std::endl
+      << "D=" << (pos % size + 1 < size ? (int16_t)s[pos + 1] : -1) << std::endl
+      << "E=" << (pos % size + 2 < size ? (int16_t)s[pos + 2] : -1) << std::endl
+      << "F=" << (pos + size < size * size ? (int16_t)s[pos + size] : -1)
+      << std::endl;
+}
+
+// Fails unless `clpf` needs less measured time than `ref_clpf` (microseconds).
+TEST_P(ClpfSpeedTest, TestSpeed) {
+  int w = sizex;
+  int h = sizey;
+  const int size = 32;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t, s[size * size]);
+  DECLARE_ALIGNED(16, uint8_t, d[size * size]);
+
+  for (int i = 0; i < size * size; i++) s[i] = rnd.Rand8();
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  for (int c = 0; c < 65536; c++) {
+    for (int ypos = 0; ypos < size; ypos += h) {
+      for (int xpos = 0; xpos < size; xpos += w) {
+        for (int strength = 0; strength < 3; strength++) {
+          ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        }
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  for (int c = 0; c < 65536; c++) {
+    for (int ypos = 0; ypos < size; ypos += h) {
+      for (int xpos = 0; xpos < size; xpos += w) {
+        for (int strength = 0; strength < 3; strength++) {
+          clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        }
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = aom_usec_timer_elapsed(&timer);
+
+#if 0
+  std::cout << "[ ] C time = " << ref_elapsed_time / 1000
+            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: ClpfSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << "us" << std::endl
+      << "SIMD time: " << elapsed_time << "us" << std::endl;
+}
+
+using std::tr1::make_tuple;
+
+// Test all supported architectures and block sizes
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ClpfBlockTest,
+    ::testing::Values(make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 8, 8),
+                      make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 8, 4),
+                      make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 4, 8),
+                      make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 4,
+                                 4)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, ClpfBlockTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 8, 8),
+        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 8, 4),
+        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 4, 8),
+        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 4, 4)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, ClpfBlockTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 8, 8),
+        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 8, 4),
+        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 4, 8),
+        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 4, 4)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, ClpfBlockTest,
+    ::testing::Values(make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 8),
+                      make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 4),
+                      make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 4, 8),
+                      make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 4,
+                                 4)));
+#endif
+
+// Test speed for all supported architectures
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, ClpfSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_sse2,
+                                                     &aom_clpf_block_c, 8, 8)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, ClpfSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_ssse3,
+                                                     &aom_clpf_block_c, 8, 8)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, ClpfSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_sse4_1,
+                                                     &aom_clpf_block_c, 8, 8)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, ClpfSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_neon,
+                                                     &aom_clpf_block_c, 8, 8)));
+#endif
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index 8dcc7bb1fcdcc00d94d8fc842741f770d5b42942..3575810388ba625fe02bf6fefc04c28efc23c13b 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -104,6 +104,7 @@ LIBAOM_TEST_SRCS-yes += convolve_test.cc
 LIBAOM_TEST_SRCS-yes += convolve_test.cc
 LIBAOM_TEST_SRCS-yes += av1_convolve_test.cc
 LIBAOM_TEST_SRCS-yes += lpf_8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_CLPF) += clpf_test.cc
 LIBAOM_TEST_SRCS-yes += intrapred_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct16x16_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct32x32_test.cc