Commit 4d2af5db authored by David Barker, committed by Debargha Mukherjee

Add SSE4.1 highbitdepth self-guided filter

Performance is very similar to the lowbd path (only 4-5% slower)

Change-Id: Ifdb272c3f6c0e6f41e7046cc49497c72b5a796d9
parent 0a4bc8d3
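For orientation, the self-guided filter's first stage computes, for every pixel, a windowed sum and sum of squares of the source plane; the SIMD code in this change splits that into a vertical pass followed by a horizontal pass. The following scalar sketch is illustrative only (not part of this change, function name hypothetical, border clamping omitted): B holds the window sum and A the window sum of squares over a (2r+1)x(2r+1) neighbourhood.

#include <stdint.h>

// Illustrative box-sum sketch; the real code clamps the window at the borders
// and vectorises the two passes.
static void boxsum_sketch(const uint16_t *src, int width, int height,
                          int stride, int r, int32_t *A, int32_t *B,
                          int buf_stride) {
  int i, j, dy, dx;
  for (i = r; i < height - r; ++i) {
    for (j = r; j < width - r; ++j) {
      int32_t a = 0, b = 0;
      for (dy = -r; dy <= r; ++dy) {
        for (dx = -r; dx <= r; ++dx) {
          const int32_t v = src[(i + dy) * stride + (j + dx)];
          a += v * v;  // sum of squares
          b += v;      // sum
        }
      }
      A[i * buf_stride + j] = a;
      B[i * buf_stride + j] = b;
    }
  }
}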
@@ -780,18 +780,18 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
# LOOP_RESTORATION functions
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration sse4_1/;
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd/;
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration_highbd/;
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
}
}
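A hypothetical call-site sketch (not from this change; all variable names illustrative, and the generated RTCD header is assumed to be included): with the new prototypes the lowbd entry point no longer takes a bit depth, while the highbd one keeps it.

#include <stdint.h>

// Illustrative dispatch between the two entry points declared above.
static void filter_tile_sketch(uint8_t *dat8, uint16_t *dat16, int w, int h,
                               int stride, int bit_depth, int eps, int *xqd,
                               uint8_t *dst8, uint16_t *dst16, int dst_stride,
                               int32_t *tmpbuf) {
  if (bit_depth > 8)
    apply_selfguided_restoration_highbd(dat16, w, h, stride, bit_depth, eps,
                                        xqd, dst16, dst_stride, tmpbuf);
  else
    apply_selfguided_restoration(dat8, w, h, stride, eps, xqd, dst8,
                                 dst_stride, tmpbuf);
}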
@@ -731,22 +731,20 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int r, int eps, int32_t *tmpbuf) {
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
dst[i * dst_stride + j] = dgd[i * stride + j];
}
}
av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
r, eps, tmpbuf);
av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
tmpbuf);
}
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
int stride, int eps, int *xqd, uint8_t *dst,
int dst_stride, int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
@@ -754,11 +752,9 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -796,7 +792,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
data_p = data + h_start + v_start * stride;
dst_p = dst + h_start + v_start * dst_stride;
apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
8, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
dst_stride, rst->tmpbuf);
}
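The body of apply_selfguided_restoration_c that follows decode_xq is elided from the hunk above; it blends the two filtered planes with the decoded weights xq[0] and xq[1] around the upscaled source pixel. The sketch below is a rough, self-contained rendering of that projection, not verbatim library code: shift amounts are passed as parameters where the library uses its SGRPROJ_* constants, and the clip/round helpers are written out inline.

#include <stdint.h>

// Per-pixel sgrproj projection, roughly as performed by
// apply_selfguided_restoration_c (illustrative sketch only).
static uint8_t sgrproj_project_sketch(uint8_t dat, int32_t flt1, int32_t flt2,
                                      const int *xq, int rst_bits,
                                      int prj_bits) {
  const int32_t u = (int32_t)dat << rst_bits;  // source pixel, upscaled
  const int32_t f1 = flt1 - u;                 // residual of the r1/e1 filter
  const int32_t f2 = flt2 - u;                 // residual of the r2/e2 filter
  const int64_t v = (int64_t)xq[0] * f1 + (int64_t)xq[1] * f2 +
                    ((int64_t)u << prj_bits);
  const int32_t w =
      (int32_t)((v + ((int64_t)1 << (rst_bits + prj_bits - 1))) >>
                (rst_bits + prj_bits));        // round to nearest
  return (uint8_t)(w < 0 ? 0 : (w > 255 ? 255 : w));  // clip to 8 bits
}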
@@ -11,22 +11,26 @@
static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
__m128i one_over_n, __m128i s, int bit_depth, int idx,
int32_t *A, int32_t *B) {
__m128i a, b;
__m128i a, b, p;
#if CONFIG_AOM_HIGHBITDEPTH
__m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
__m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a),
_mm_set1_epi32(2 * (bit_depth - 8)));
b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b),
_mm_set1_epi32(bit_depth - 8));
a = _mm_mullo_epi32(a, n);
b = _mm_mullo_epi32(b, b);
__m128i p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
#else
(void)bit_depth;
a = _mm_mullo_epi32(sum_sq, n);
b = _mm_mullo_epi32(sum, sum);
__m128i p = _mm_sub_epi32(a, b);
if (bit_depth > 8) {
__m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
__m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a),
_mm_set1_epi32(2 * (bit_depth - 8)));
b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b),
_mm_set1_epi32(bit_depth - 8));
a = _mm_mullo_epi32(a, n);
b = _mm_mullo_epi32(b, b);
p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
} else {
#endif
(void)bit_depth;
a = _mm_mullo_epi32(sum_sq, n);
b = _mm_mullo_epi32(sum, sum);
p = _mm_sub_epi32(a, b);
#if CONFIG_AOM_HIGHBITDEPTH
}
#endif
__m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
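In scalar terms, the new bit_depth > 8 branch above scales the window sums back to an 8-bit-equivalent range, with rounding, before forming p = n * sum_sq - sum^2 (n^2 times the window variance); taking max(a, b) first keeps p non-negative when the rounded-down a would otherwise dip below b, so the rest of calc_block can proceed exactly as in the 8-bit case. A minimal scalar sketch of that branch (function name hypothetical):

#include <stdint.h>

// Scalar rendering of the bit_depth > 8 path in calc_block (illustrative).
static int32_t calc_p_highbd_sketch(int32_t sum, int32_t sum_sq, int32_t n,
                                    int bit_depth) {
  const int shift = bit_depth - 8;
  const int32_t a = n * ((sum_sq + ((1 << (2 * shift)) >> 1)) >> (2 * shift));
  const int32_t bb = (sum + ((1 << shift) >> 1)) >> shift;
  const int32_t b = bb * bb;
  return (a > b ? a : b) - b;  // == max(n * sum_sq' - sum'^2, 0)
}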
@@ -53,9 +57,9 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
_mm_storeu_si128((__m128i *)&B[idx], b_res);
}
static void selfguided_restoration_1(uint8_t *src, int width, int height,
int src_stride, int eps, int bit_depth,
int32_t *A, int32_t *B, int buf_stride) {
static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
int src_stride, int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
// Vertical sum
@@ -110,8 +114,15 @@ static void selfguided_restoration_1(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
}
}
static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width,
int height, int buf_stride, int eps,
int bit_depth) {
int i, j;
// Horizontal sum
int width_extend = (width + 3) & ~3;
for (i = 0; i < height; ++i) {
int h = AOMMIN(2, height - i) + AOMMIN(1, i);
@@ -220,9 +231,9 @@ static void selfguided_restoration_1(uint8_t *src, int width, int height,
}
}
static void selfguided_restoration_2(uint8_t *src, int width, int height,
int src_stride, int eps, int bit_depth,
int32_t *A, int32_t *B, int buf_stride) {
static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
int src_stride, int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
// Vertical sum
@@ -295,8 +306,15 @@ static void selfguided_restoration_2(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
}
}
static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width,
int height, int buf_stride, int eps,
int bit_depth) {
int i, j;
// Horizontal sum
int width_extend = (width + 3) & ~3;
for (i = 0; i < height; ++i) {
int h = AOMMIN(3, height - i) + AOMMIN(2, i);
@@ -414,9 +432,9 @@ static void selfguided_restoration_2(uint8_t *src, int width, int height,
}
}
static void selfguided_restoration_3(uint8_t *src, int width, int height,
int src_stride, int eps, int bit_depth,
int32_t *A, int32_t *B, int buf_stride) {
static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
int src_stride, int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
// Vertical sum over 7-pixel regions, 4 columns at a time
@@ -507,8 +525,14 @@ static void selfguided_restoration_3(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
}
}
static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
int height, int buf_stride, int eps,
int bit_depth) {
int i, j;
// Horizontal sum over 7-pixel regions of dst
int width_extend = (width + 3) & ~3;
for (i = 0; i < height; ++i) {
int h = AOMMIN(4, height - i) + AOMMIN(3, i);
@@ -641,8 +665,7 @@ static void selfguided_restoration_3(uint8_t *src, int width, int height,
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int r, int eps, int32_t *tmpbuf) {
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
@@ -655,14 +678,14 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
if ((width < 5) || (height < 5)) return;
if (r == 1) {
selfguided_restoration_1(dgd, width, height, stride, eps, bit_depth, A, B,
buf_stride);
selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
} else if (r == 2) {
selfguided_restoration_2(dgd, width, height, stride, eps, bit_depth, A, B,
buf_stride);
selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
} else if (r == 3) {
selfguided_restoration_3(dgd, width, height, stride, eps, bit_depth, A, B,
buf_stride);
selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
} else {
assert(0);
}
@@ -861,8 +884,8 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
}
void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
int stride, int eps, int *xqd,
uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = tmpbuf;
@@ -870,20 +893,12 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
// The SSE4.1 code does not currently support highbitdepth, so drop back
// to the C filter in that case.
// TODO(david.barker): Allow bit_depth > 8 in the SSE4.1 code.
if (bit_depth != 8) {
apply_selfguided_restoration_c(dat, width, height, stride, bit_depth, eps,
xqd, dst, dst_stride, tmpbuf);
return;
}
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1,
tmpbuf2);
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2,
tmpbuf2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
@@ -943,3 +958,544 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
}
}
}
#if CONFIG_AOM_HIGHBITDEPTH
// Only the vertical sums need to be adjusted for highbitdepth
static void highbd_selfguided_restoration_1_v(uint16_t *src, int width,
int height, int src_stride,
int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
int width_extend = (width + 3) & ~3;
for (j = 0; j < width_extend; j += 4) {
__m128i a, b, x, y, x2, y2;
__m128i sum, sum_sq, tmp;
a = _mm_loadl_epi64((__m128i *)&src[j]);
b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
tmp = _mm_unpacklo_epi16(a, b);
sum_sq = _mm_madd_epi16(tmp, tmp);
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
for (i = 1; i < height - 2; ++i) {
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
y = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
x2 = _mm_mullo_epi32(x, x);
y2 = _mm_mullo_epi32(y, y);
sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
}
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
}
}
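The vertical pass above keeps a per-column running sum and sum of squares: it is seeded with the first rows, stored for each output row, then slid down by adding the incoming row and subtracting the outgoing one. A scalar sketch of the same idea for one column and r = 1 (illustrative only, assumes height >= 5 as guaranteed by the caller; the SIMD version processes four columns per iteration):

#include <stdint.h>

// Sliding-window column sums for r = 1 (window of rows [i-1, i+1], clamped).
static void column_sums_r1_sketch(const uint16_t *src, int height,
                                  int src_stride, int32_t *A, int32_t *B,
                                  int buf_stride, int j) {
  int32_t sum = 0, sum_sq = 0;
  int i;
  // Seed with rows 0 and 1: the row-0 window has no row above it.
  for (i = 0; i < 2; ++i) {
    const int32_t v = src[i * src_stride + j];
    sum += v;
    sum_sq += v * v;
  }
  for (i = 0; i < height; ++i) {
    B[i * buf_stride + j] = sum;
    A[i * buf_stride + j] = sum_sq;
    // Slide the window: add row i+2 (if it exists), drop row i-1 (if any).
    if (i + 2 < height) {
      const int32_t v = src[(i + 2) * src_stride + j];
      sum += v;
      sum_sq += v * v;
    }
    if (i - 1 >= 0) {
      const int32_t v = src[(i - 1) * src_stride + j];
      sum -= v;
      sum_sq -= v * v;
    }
  }
}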
static void highbd_selfguided_restoration_2_v(uint16_t *src, int width,
int height, int src_stride,
int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
int width_extend = (width + 3) & ~3;
for (j = 0; j < width_extend; j += 4) {
__m128i a, b, c, c2, x, y, x2, y2;
__m128i sum, sum_sq, tmp;
a = _mm_loadl_epi64((__m128i *)&src[j]);
b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
// Important: We need to widen *before* squaring here, since
// c^2 may be up to 2^24.
c = _mm_cvtepu16_epi32(c);
c2 = _mm_mullo_epi32(c, c);
tmp = _mm_unpacklo_epi16(a, b);
sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
for (i = 2; i < height - 3; ++i) {
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
y = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
x2 = _mm_mullo_epi32(x, x);
y2 = _mm_mullo_epi32(y, y);
sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
}
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
}
}
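The "widen before squaring" comment in the r = 2 vertical pass above is about lane width: with samples of up to 12 bits, a single square needs up to 24 bits and would overflow a 16-bit lane, whereas the sum of two squares produced by _mm_madd_epi16 for the first two rows still fits in a signed 32-bit lane. A tiny standalone check of those bounds (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int64_t max12 = (1 << 12) - 1;   // largest 12-bit sample value
  assert(max12 * max12 > UINT16_MAX);    // one square overflows 16 bits
  assert(2 * max12 * max12 < INT32_MAX); // a pair of squares fits in int32
  return 0;
}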
static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
int height, int src_stride,
int32_t *A, int32_t *B,
int buf_stride) {
int i, j;
int width_extend = (width + 3) & ~3;
for (j = 0; j < width_extend; j += 4) {
__m128i a, b, c, d, x, y, x2, y2;
__m128i sum, sum_sq, tmp, tmp2;
a = _mm_loadl_epi64((__m128i *)&src[j]);
b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]);
sum = _mm_cvtepi16_epi32(
_mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
tmp = _mm_unpacklo_epi16(a, b);
tmp2 = _mm_unpacklo_epi16(c, d);
sum_sq =
_mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
for (i = 3; i < height - 4; ++i) {
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
y = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
x2 = _mm_mullo_epi32(x, x);
y2 = _mm_mullo_epi32(y, y);
sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
}
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
x = _mm_cvtepu16_epi32(
_mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
_mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
}
}
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
int height, int stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
if (r == 1) {
highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
} else if (r == 2) {
highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
} else if (r == 3) {
highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
} else {
assert(0);
}
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
A[k + buf_stride + 1];
const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
A[k + buf_stride - 1] + A[k + buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
B[k + buf_stride - 1] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int m = i * dst_stride + j;
const int nb =