Commit 7227b65c authored by Michael Bebenita, committed by Yaowu Xu

Add SSE4.1 code for deringing functions.

Change-Id: I363f7fb610a5c86ea9f417e34b57c6373af877e5
parent 4713d8d0
@@ -98,6 +98,8 @@ endif
 ifeq ($(CONFIG_DERING),yes)
 AV1_COMMON_SRCS-yes += common/od_dering.c
 AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
 AV1_COMMON_SRCS-yes += common/dering.c
 AV1_COMMON_SRCS-yes += common/dering.h
 endif
...
@@ -20,6 +20,7 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
+typedef int16_t od_dering_in;
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -840,4 +841,24 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
 }
 # end encoder functions
+
+# Deringing Functions
+
+if (aom_config("CONFIG_DERING") eq "yes") {
+  add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+  specialize qw/od_dir_find8 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_4x4 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_4x4 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
+}
+
 1;
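For context on the block above: aom's rtcd ("run-time CPU detection") generator turns each add_proto/specialize pair into a function pointer that starts out at the C implementation and is retargeted during initialization when the host CPU supports the specialized ISA. A minimal sketch of the dispatch this generates for od_dir_find8, assuming the usual aom conventions (x86_simd_caps() and HAS_SSE4_1 come from aom_ports; the real code is the generated av1_rtcd.h):

int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
                   int coeff_shift);
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift);

/* Dispatch pointer; defaults to the portable C path. */
int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                    int coeff_shift) = od_dir_find8_c;

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps(); /* CPU feature query from aom_ports/x86.h */
  od_dir_find8 = od_dir_find8_c;
  if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
}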
@@ -111,7 +111,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
       if (pli) level = (level * 5 + 4) >> 3;
       if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
       threshold = level << coeff_shift;
-      od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
+      od_dering(dst, MAX_MIB_SIZE * bsize[pli],
                &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                          sbc * bsize[pli] * MAX_MIB_SIZE],
                stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
...
@@ -15,11 +15,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include "dering.h"
+#include "./av1_rtcd.h"
 
-const od_dering_opt_vtbl OD_DERING_VTBL_C = {
-  { od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c },
-  { od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c }
-};
-
 /* Generated from gen_filter_tables.c. */
 const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
@@ -42,8 +38,8 @@ const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
    in a particular direction. Since each direction has the same sum(x^2) term,
    that term is never computed. See Section 2, step 2, of:
    http://jmvalin.ca/notes/intra_paint.pdf */
-static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
-                        int coeff_shift) {
+int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
+                   int coeff_shift) {
   int i;
   int32_t cost[8] = { 0 };
   int partial[8][15] = { { 0 } };
@@ -273,9 +269,8 @@ static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
   }
 }
 
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nhb, int nvb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift) {
@@ -289,6 +284,12 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh2[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
+    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+  };
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
+    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+  };
   bsize = 3 - xdec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average
@@ -340,7 +341,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
          to be a little bit more aggressive on pure horizontal/vertical
          since the ringing there tends to be directional, so it doesn't
          get removed by the directional filtering. */
-      thresh2[by][bx] = (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+      thresh2[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
          dir[by][bx]);
@@ -354,7 +355,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   for (by = 0; by < nvb; by++) {
     for (bx = 0; bx < nhb; bx++) {
       if (thresh[by][bx] == 0) continue;
-      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh2[by][bx],
          dir[by][bx]);
...
@@ -34,27 +34,11 @@ typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
-
-struct od_dering_opt_vtbl {
-  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
-  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
-};
-typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
-
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nvb, int nhb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift);
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
-                                  int ln, int threshold, int dir);
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
-                                   const od_dering_in *x, int xstride, int ln,
-                                   int threshold, int dir);
-extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
@@ -65,5 +49,4 @@ void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
-
 #endif
...
common/x86/od_dering_sse4.c (new file)
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "./av1_rtcd.h"
#include "av1/common/x86/od_dering_sse4.h"
/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
__m128i const1, __m128i const2) {
__m128i tmp;
/* Reverse partial B. */
partialb = _mm_shuffle_epi8(
partialb,
_mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
/* Interleave the x and y values of identical indices and pair x8 with 0. */
tmp = partiala;
partiala = _mm_unpacklo_epi16(partiala, partialb);
partialb = _mm_unpackhi_epi16(tmp, partialb);
/* Square and add the corresponding x and y values. */
partiala = _mm_madd_epi16(partiala, partiala);
partialb = _mm_madd_epi16(partialb, partialb);
/* Multiply by constant. */
partiala = _mm_mullo_epi32(partiala, const1);
partialb = _mm_mullo_epi32(partialb, const2);
/* Sum all results. */
partiala = _mm_add_epi32(partiala, partialb);
return partiala;
}
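/* Illustrative scalar sketch (not part of the commit) of the quantity that
   the four 32-bit lanes returned by fold_mul_and_sum add up to once hsum4
   folds them together: */
static INLINE int32_t fold_mul_and_sum_ref(const int16_t x[8],
                                           const int16_t y[8],
                                           const int32_t c[8]) {
  /* x = [x1..x8], y = [y1..y7, 0]; c holds C1..C8, e.g. { 840, 420, 280,
     210, 168, 140, 120, 105 } for direction 4 below. */
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; k++) sum += (x[k] * x[k] + y[k] * y[k]) * c[k];
  return sum;
}
/* hsum4 transposes four vectors of four 32-bit lanes and adds them, so lane
   i of the result holds the horizontal sum of input xi. */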
static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
__m128i t0, t1, t2, t3;
t0 = _mm_unpacklo_epi32(x0, x1);
t1 = _mm_unpacklo_epi32(x2, x3);
t2 = _mm_unpackhi_epi32(x0, x1);
t3 = _mm_unpackhi_epi32(x2, x3);
x0 = _mm_unpacklo_epi64(t0, t1);
x1 = _mm_unpackhi_epi64(t0, t1);
x2 = _mm_unpacklo_epi64(t2, t3);
x3 = _mm_unpackhi_epi64(t2, t3);
return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}
/* Horizontal sum of 8 16-bit values. */
static INLINE int32_t hsum_epi16(__m128i a) {
a = _mm_madd_epi16(a, _mm_set1_epi16(1));
a = _mm_hadd_epi32(a, a);
a = _mm_hadd_epi32(a, a);
return _mm_cvtsi128_si32(a);
}
/* Computes cost for directions 4, 5, 6 and 7. It is called a second time on
   the transposed block to compute the remaining directions. */
static INLINE __m128i compute_directions(__m128i lines[8],
int32_t tmp_cost1[4]) {
__m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
__m128i partial6;
__m128i tmp;
/* Partial sums for lines 0 and 1. */
partial4a = _mm_slli_si128(lines[0], 14);
partial4b = _mm_srli_si128(lines[0], 2);
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
tmp = _mm_add_epi16(lines[0], lines[1]);
partial5a = _mm_slli_si128(tmp, 10);
partial5b = _mm_srli_si128(tmp, 6);
partial7a = _mm_slli_si128(tmp, 4);
partial7b = _mm_srli_si128(tmp, 12);
partial6 = tmp;
/* Partial sums for lines 2 and 3. */
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
tmp = _mm_add_epi16(lines[2], lines[3]);
partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
partial6 = _mm_add_epi16(partial6, tmp);
/* Partial sums for lines 4 and 5. */
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
tmp = _mm_add_epi16(lines[4], lines[5]);
partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
partial6 = _mm_add_epi16(partial6, tmp);
/* Partial sums for lines 6 and 7. */
partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
partial4a = _mm_add_epi16(partial4a, lines[7]);
tmp = _mm_add_epi16(lines[6], lines[7]);
partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
partial6 = _mm_add_epi16(partial6, tmp);
/* Compute costs in terms of partial sums. */
partial4a =
fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
_mm_set_epi32(105, 120, 140, 168));
partial7a =
fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
_mm_set_epi32(105, 105, 105, 140));
partial5a =
fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
_mm_set_epi32(105, 105, 105, 140));
partial6 = _mm_madd_epi16(partial6, partial6);
partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
_mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
return partial4a;
}
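/* Note on the constants above: each line of pixels summed in a given
   direction contains between 1 and 8 pixels, and the cost wants
   sum(partial)^2 / N per line. The multipliers appear to be 840 / N
   (840 = lcm(1..8); hence 840, 420, 280, 210, 168, 140, 120, 105), which
   keeps the math in integers; the common factor of 840 is then absorbed by
   the cheap divide-by-1024 noted in od_dir_find8_sse4_1 below. */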
/* transpose and reverse the order of the lines -- equivalent to a 90-degree
counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
int coeff_shift) {
int i;
int32_t cost[8];
int32_t best_cost = 0;
int best_dir = 0;
__m128i lines[8];
__m128i dir03, dir47;
__m128i max;
for (i = 0; i < 8; i++) {
lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
_mm_set1_epi16(128));
}
/* Compute "mostly vertical" directions. */
dir47 = compute_directions(lines, cost + 4);
array_reverse_transpose_8x8(lines, lines);
/* Compute "mostly horizontal" directions. */
dir03 = compute_directions(lines, cost);
#if 1
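  /* Branchless argmax: lanes equal to the overall max become ~index
     (i.e. -1 - index) and all other lanes become 0. The unsigned max then
     selects the smallest index among the maxima (~0 > ~1 > ... as unsigned),
     and the final xor with all-ones recovers the index itself. The #else
     branch below is the scalar equivalent. */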
max = _mm_max_epi32(dir03, dir47);
max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
_mm_setr_epi32(-1, -2, -3, -4));
dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
_mm_setr_epi32(-5, -6, -7, -8));
dir03 = _mm_max_epu32(dir03, dir47);
dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
dir03 =
_mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));
best_dir = _mm_cvtsi128_si32(dir03);
best_cost = _mm_cvtsi128_si32(max);
#else
for (i = 0; i < 8; i++) {
if (cost[i] > best_cost) {
best_cost = cost[i];
best_dir = i;
}
}
#endif
/* Difference between the optimal variance and the variance along the
orthogonal direction. Again, the sum(x^2) terms cancel out. */
*var = best_cost - cost[(best_dir + 4) & 7];
/* We'd normally divide by 840, but dividing by 1024 is close enough
for what we're going to do with this. */
*var >>= 10;
return best_dir;
}
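/* All-ones 16-bit lane mask where |in| < threshold; used below to zero out
   taps whose difference from the center pixel reaches the threshold. */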
static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
}
int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride,
const int16_t *in, int threshold,
int dir) {
int i;
__m128i sum;
__m128i p;
__m128i cmp;
__m128i row;
__m128i res;
__m128i tmp;
__m128i thresh;
__m128i total_abs;
int off1, off2;
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
total_abs = _mm_setzero_si128();
thresh = _mm_set1_epi16(threshold);
for (i = 0; i < 4; i += 2) {
sum = _mm_set1_epi16(0);
row = _mm_unpacklo_epi64(
_mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
_mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
tmp = _mm_unpacklo_epi64(
_mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
_mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
p = _mm_sub_epi16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_slli_epi16(p, 2);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
tmp = _mm_unpacklo_epi64(
_mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
_mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
p = _mm_sub_epi16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_slli_epi16(p, 2);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
tmp = _mm_unpacklo_epi64(
_mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
_mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
p = _mm_sub_epi16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
tmp = _mm_unpacklo_epi64(
_mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
_mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
p = _mm_sub_epi16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*res = row + ((sum + 8) >> 4)*/
res = _mm_add_epi16(sum, _mm_set1_epi16(8));
res = _mm_srai_epi16(res, 4);
total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
res = _mm_add_epi16(row, res);
_mm_storel_epi64((__m128i *)&y[i * ystride], res);
_mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
_mm_unpackhi_epi64(res, res));
}
return (hsum_epi16(total_abs) + 2) >> 2;
}
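/* Illustrative scalar rendering (not part of the commit) of what the
   vectorized 4x4 loop above computes: taps of 4 and 1 at +/-off1 and
   +/-off2 along the chosen direction, each tap gated by the threshold test,
   with the same rounding and the same (total_abs + 2) >> 2 return value. */
static int od_filter_dering_direction_4x4_ref(int16_t *y, int ystride,
                                              const int16_t *in, int threshold,
                                              int dir) {
  static const int taps[2] = { 4, 1 };
  int i, j, k;
  int total_abs = 0;
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      int sum = 0;
      int16_t xx = in[i * OD_FILT_BSTRIDE + j];
      for (k = 0; k < 2; k++) {
        int off = OD_DIRECTION_OFFSETS_TABLE[dir][k];
        int p = in[i * OD_FILT_BSTRIDE + j + off] - xx;
        if (p > -threshold && p < threshold) sum += taps[k] * p;
        p = in[i * OD_FILT_BSTRIDE + j - off] - xx;
        if (p > -threshold && p < threshold) sum += taps[k] * p;
      }
      /* Rounded shift matches res = row + ((sum + 8) >> 4) above. */
      sum = (sum + 8) >> 4;
      total_abs += sum < 0 ? -sum : sum;
      y[i * ystride + j] = (int16_t)(xx + sum);
    }
  }
  return (total_abs + 2) >> 2;
}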
int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride,
const int16_t *in, int threshold,
int dir) {
int i;
__m128i sum;
__m128i p;
__m128i cmp;
__m128i row;
__m128i res;
__m128i thresh;
__m128i total_abs;
int off1, off2, off3;
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
total_abs = _mm_setzero_si128();
thresh = _mm_set1_epi16(threshold);
for (i = 0; i < 8; i++) {
sum = _mm_set1_epi16(0);
row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_slli_epi16(p, 1);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_slli_epi16(p, 1);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
p = _mm_sub_epi16(
_mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = _mm_and_si128(p, cmp);
sum = _mm_add_epi16(sum, p);
/*res = row + ((sum + 8) >> 4)*/
res = _mm_add_epi16(sum, _mm_set1_epi16(8));
res = _mm_srai_epi16(res, 4);
total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
res = _mm_add_epi16(row, res);
_mm_storeu_si128((__m128i *)&y[i * ystride], res);