Commit 7cf60961 authored by Rupert Swarbrick

Remove unused highpass filter from SGR code

Change-Id: Ifac3a3bf620061865b82b986d6b16bcabd96a187
parent 064c1d47
......@@ -557,18 +557,12 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_selfguided_restoration sse4_1/;
add_proto qw/void av1_highpass_filter/, "const uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_highpass_filter sse4_1/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "const uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, const int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "const uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_highpass_filter_highbd/, "const uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_highpass_filter_highbd sse4_1/;
}
}
......
......@@ -26,13 +26,6 @@
#include "aom_ports/mem.h"
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
// corner, edge, r2, eps2
{ -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
{ -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
{ -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
{ -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
// r1, eps1, r2, eps2
#if MAX_RADIUS == 2
{ 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
......@@ -45,7 +38,6 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
{ 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
{ 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
#endif // MAX_RADIUS == 2
#endif
};
#if CONFIG_MAX_TILE
......@@ -1110,97 +1102,6 @@ void av1_selfguided_restoration_c(const uint8_t *dgd, int width, int height,
dst_stride, 8, r, eps);
}
// Applies a 3x3 high-pass kernel to the width x height region of `dgd` and
// writes one 32-bit result per pixel to `dst`.
//
// Kernel taps:
//   corner - weight of the four diagonal neighbours
//   edge   - weight of the four horizontal/vertical neighbours
//   center - (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge), the weight of the
//            pixel itself, so the nine taps sum to 1 << SGRPROJ_RST_BITS.
//
// Parameters:
//   dgd, stride     - source pixels and the distance between source rows
//   width, height   - size of the region to filter
//   dst, dst_stride - output buffer and the distance between output rows
//
// Neighbours that fall outside the region are replaced by the nearest pixel
// inside it (coordinates are clamped), which reproduces the results of the
// former hand-written corner/edge cases. Because of the clamping, no pixel
// outside the width x height region is ever read, including when width or
// height is 1. Nothing is written when width or height is not positive.
void av1_highpass_filter_c(const uint8_t *dgd, int width, int height,
                           int stride, int32_t *dst, int dst_stride, int corner,
                           int edge) {
  int i, j;
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);

  for (i = 0; i < height; ++i) {
    // Rows above/below the current one; the first and last rows stand in for
    // their missing neighbour.
    const uint8_t *const cur = dgd + i * stride;
    const uint8_t *const above = (i > 0) ? cur - stride : cur;
    const uint8_t *const below = (i < height - 1) ? cur + stride : cur;
    int32_t *const out = dst + i * dst_stride;

    for (j = 0; j < width; ++j) {
      // Columns left/right of the current one, clamped the same way.
      const int jl = (j > 0) ? j - 1 : j;
      const int jr = (j < width - 1) ? j + 1 : j;

      out[j] = center * cur[j] +
               edge * (above[j] + below[j] + cur[jl] + cur[jr]) +
               corner * (above[jl] + above[jr] + below[jl] + below[jr]);
    }
  }
}
void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height,
int stride, int eps, const int *xqd,
uint8_t *dst, int dst_stride,
......@@ -1210,13 +1111,8 @@ void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height,
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
av1_highpass_filter_c(dat, width, height, stride, flt1, width,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
......@@ -1295,97 +1191,6 @@ void av1_selfguided_restoration_highbd_c(const uint16_t *dgd, int width,
dst_stride, bit_depth, r, eps);
}
// High bit-depth (16-bit sample) variant of the 3x3 high-pass filter: filters
// the width x height region of `dgd` and writes one 32-bit result per pixel
// to `dst`.
//
// Kernel taps:
//   corner - weight of the four diagonal neighbours
//   edge   - weight of the four horizontal/vertical neighbours
//   center - (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge), the weight of the
//            pixel itself, so the nine taps sum to 1 << SGRPROJ_RST_BITS.
//
// Parameters:
//   dgd, stride     - source samples and the distance between source rows
//   width, height   - size of the region to filter
//   dst, dst_stride - output buffer and the distance between output rows
//
// Neighbours that fall outside the region are replaced by the nearest sample
// inside it (coordinates are clamped), which reproduces the results of the
// former hand-written corner/edge cases. Because of the clamping, no sample
// outside the width x height region is ever read, including when width or
// height is 1. Nothing is written when width or height is not positive.
void av1_highpass_filter_highbd_c(const uint16_t *dgd, int width, int height,
                                  int stride, int32_t *dst, int dst_stride,
                                  int corner, int edge) {
  int i, j;
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);

  for (i = 0; i < height; ++i) {
    // Rows above/below the current one; the first and last rows stand in for
    // their missing neighbour.
    const uint16_t *const cur = dgd + i * stride;
    const uint16_t *const above = (i > 0) ? cur - stride : cur;
    const uint16_t *const below = (i < height - 1) ? cur + stride : cur;
    int32_t *const out = dst + i * dst_stride;

    for (j = 0; j < width; ++j) {
      // Columns left/right of the current one, clamped the same way.
      const int jl = (j > 0) ? j - 1 : j;
      const int jr = (j < width - 1) ? j + 1 : j;

      out[j] = center * cur[j] +
               edge * (above[j] + below[j] + cur[jl] + cur[jr]) +
               corner * (above[jl] + above[jr] + below[jl] + below[jr]);
    }
  }
}
void apply_selfguided_restoration_highbd_c(const uint16_t *dat, int width,
int height, int stride,
int bit_depth, int eps,
......@@ -1396,14 +1201,9 @@ void apply_selfguided_restoration_highbd_c(const uint16_t *dat, int width,
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2);
......
......@@ -105,7 +105,6 @@ extern "C" {
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
......@@ -115,17 +114,10 @@ extern "C" {
#define SGRPROJ_SGR_BITS 8
#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
// Inclusive [MIN, MAX] ranges for two quantities, indexed 0 and 1. Each range
// holds exactly (1 << SGRPROJ_PRJ_BITS) consecutive integers.
// NOTE(review): presumably these bound the two projection coefficients of the
// self-guided filter -- confirm against the code that uses them.
#if USE_HIGHPASS_IN_SGRPROJ
// With SGRPROJ_PRJ_BITS == 7 these evaluate to [-16, 111] and [-64, 63].
#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) / 8)
#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 2)
#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
#else
// With SGRPROJ_PRJ_BITS == 7 these evaluate to [-96, 31] and [-32, 95].
#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
#endif // USE_HIGHPASS_IN_SGRPROJ
#define SGRPROJ_PRJ_SUBEXP_K 4
......@@ -202,13 +194,8 @@ extern "C" {
#endif
// One entry of the sgr_params table: the settings for the two filter passes
// that the apply_selfguided_restoration* functions run on a tile.
typedef struct {
#if USE_HIGHPASS_IN_SGRPROJ
// First pass is the high-pass filter; these are passed as its `corner` and
// `edge` tap arguments.
int corner;
int edge;
#else
// First pass is the self-guided filter; these are passed as its `r` and `eps`
// arguments.
int r1;
int e1;
#endif // USE_HIGHPASS_IN_SGRPROJ
// Second pass is always the self-guided filter; these are passed as its `r`
// and `eps` arguments.
int r2;
int e2;
} sgr_params_type;
......
......@@ -340,167 +340,6 @@ static void selfguided_restoration(const uint8_t *dgd8, int width, int height,
height, highbd);
}
// SSE4.1 implementation of the 8-bit 3x3 high-pass filter.
//
// For every pixel of the width x height region of `dgd` it writes
//   center * pixel + edge * (sum of the 4 side neighbours)
//                  + corner * (sum of the 4 diagonal neighbours)
// to `dst` as a 32-bit value, where
//   center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge).
// On the border of the region, each missing neighbour is replaced by a pixel
// inside the region (see the individual cases below).
//
// Only the interior columns of rows 1 .. height - 2 are vectorised; the first
// and last rows and the first and last columns use scalar code.
void av1_highpass_filter_sse4_1(const uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int corner, int edge) {
int i, j;
// Centre tap, chosen so that the nine taps sum to 1 << SGRPROJ_RST_BITS.
const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
// First row (i = 0), scalar. There is no row above, so the current row is
// used in its place.
{
i = 0;
j = 0;
// Top-left pixel: both the row above and the column to the left are missing.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
corner *
(dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
}
// Top row, interior columns.
for (j = 1; j < width - 1; ++j) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] = center * dgd[k] +
edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
dgd[k - 1] + dgd[k + 1]);
}
j = width - 1;
// Top-right pixel.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
corner *
(dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
}
}
// Last row (i = height - 1), scalar. There is no row below, so the current
// row is used in its place.
{
i = height - 1;
j = 0;
// Bottom-left pixel.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
corner *
(dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
}
// Bottom row, interior columns.
for (j = 1; j < width - 1; ++j) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] = center * dgd[k] +
edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
dgd[k - 1] + dgd[k + 1]);
}
j = width - 1;
// Bottom-right pixel.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
corner *
(dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
}
}
// Broadcast the taps into 16-bit lanes: the vector path below multiplies and
// accumulates in 16 bits and only widens to 32 bits when storing.
// NOTE(review): this assumes every product and the final sum fit in a signed
// 16-bit lane -- confirm against the range of corner/edge/center in use.
__m128i center_ = _mm_set1_epi16(center);
__m128i edge_ = _mm_set1_epi16(edge);
__m128i corner_ = _mm_set1_epi16(corner);
// Rows 1 .. height - 2: all three source rows exist.
for (i = 1; i < height - 1; ++i) {
j = 0;
// First column, scalar: the missing left column is replaced by column 0.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] +
edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
dgd[k - stride] + dgd[k + stride]);
}
// Process in units of 8 pixels at a time.
for (j = 1; j < width - 8; j += 8) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
// a/b/c: 16 bytes of the rows above / current / below, starting one column
// to the left of j (columns j - 1 .. j + 14). Only columns j - 1 .. j + 8
// contribute to the 8 results.
// NOTE(review): the 16-byte loads can read up to 6 bytes past column
// width - 1; presumably the source rows are padded -- confirm with callers.
__m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
__m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
__m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
// Zero-extend to 16 bits: *l holds columns j - 1 .. j + 6, *r holds columns
// j + 7 .. j + 14 (t = row above, c = current row, b = row below).
__m128i tl = _mm_cvtepu8_epi16(a);
__m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
__m128i cl = _mm_cvtepu8_epi16(b);
__m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8));
__m128i bl = _mm_cvtepu8_epi16(c);
__m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8));
// x: the centre pixels (columns j .. j + 7).
__m128i x = _mm_alignr_epi8(cr, cl, 2);
// y: sum of the four side neighbours (up + left + down + right).
__m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl),
_mm_add_epi16(_mm_alignr_epi8(br, bl, 2),
_mm_alignr_epi8(cr, cl, 4)));
// z: sum of the four diagonal neighbours.
__m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl),
_mm_add_epi16(_mm_alignr_epi8(tr, tl, 4),
_mm_alignr_epi8(br, bl, 4)));
__m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
_mm_add_epi16(_mm_mullo_epi16(y, edge_),
_mm_mullo_epi16(z, corner_)));
// Sign-extend the eight 16-bit results to 32 bits and store them as two
// groups of four.
_mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
_mm_storeu_si128((__m128i *)&dst[l + 4],
_mm_cvtepi16_epi32(_mm_srli_si128(res, 8)));
}
// If there are enough pixels left in this row, do another batch of 4
// pixels.
for (; j < width - 4; j += 4) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
// 8-byte loads covering columns j - 1 .. j + 6 of the three rows; only
// columns j - 1 .. j + 4 contribute to the 4 results.
// NOTE(review): these loads can read up to 2 bytes past column width - 1;
// presumably the source rows are padded -- confirm with callers.
__m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]);
__m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]);
__m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]);
__m128i tl = _mm_cvtepu8_epi16(a);
__m128i cl = _mm_cvtepu8_epi16(b);
__m128i bl = _mm_cvtepu8_epi16(c);
// Same x/y/z as in the 8-pixel loop, using byte shifts instead of alignr;
// only the low four 16-bit lanes are meaningful.
__m128i x = _mm_srli_si128(cl, 2);
__m128i y = _mm_add_epi16(
_mm_add_epi16(_mm_srli_si128(tl, 2), cl),
_mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4)));
__m128i z = _mm_add_epi16(
_mm_add_epi16(tl, bl),
_mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4)));
__m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
_mm_add_epi16(_mm_mullo_epi16(y, edge_),
_mm_mullo_epi16(z, corner_)));
// Only the low four lanes are widened and stored.
_mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
}
// Handle any leftover pixels
for (; j < width - 1; ++j) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] +
edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
dgd[k - stride + 1] + dgd[k + stride + 1]);
}
j = width - 1;
// Last column, scalar: the missing right column is replaced by column
// width - 1.
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] +
edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
dgd[k - stride] + dgd[k + stride]);
}
}
}
void av1_selfguided_restoration_sse4_1(const uint8_t *dgd, int width,
int height, int dgd_stride, int32_t *dst,
int dst_stride, int r, int eps) {
......@@ -517,13 +356,8 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width,
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
......@@ -578,137 +412,6 @@ void av1_selfguided_restoration_highbd_sse4_1(const uint16_t *dgd, int width,
dst, dst_stride, r, eps, bit_depth, 1);
}
void av1_highpass_filter_highbd_sse4_1(const uint16_t *dgd, int width,
int height, int stride, int32_t *dst,
int dst_stride, int corner, int edge) {
int i, j;
const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
{
i = 0;
j = 0;
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
corner *
(dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
}
for (j = 1; j < width - 1; ++j) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] = center * dgd[k] +
edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
dgd[k - 1] + dgd[k + 1]);
}
j = width - 1;
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
corner *
(dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
}
}
__m128i center_ = _mm_set1_epi32(center);
__m128i edge_ = _mm_set1_epi32(edge);
__m128i corner_ = _mm_set1_epi32(corner);
for (i = 1; i < height - 1; ++i) {
j = 0;
{
const int k = i * stride + j;
const int l = i * dst_stride + j;
dst[l] =
center * dgd[k] +
edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
dgd[k - stride] + dgd[k + stride]);
}
// Process 4 pixels at a time
for (j = 1; j < width - 4; j += 4) {
const int k = i * stride + j;
const int l = i * dst_stride + j;
__m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
__m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
__m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
__m128i tl = _mm_cvtepu16_epi32(a);
__m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
__m128i cl = _mm_cvtepu16_epi32(b);
__m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8));
__m128i bl = _mm_cvtepu16_epi32(c);
__m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8));
__m128i x = _mm_alignr_epi8(cr, cl, 4);
__m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl),
_mm_add_epi32(_mm_alignr_epi8(br, bl, 4),
_mm_alignr_epi8(cr, cl, 8)));
__m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl),
_mm_add_epi32(_mm_alignr_epi8(tr, tl, 8),
_mm_alignr_epi8(br, bl, 8)));
__m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_),
_mm_add_epi32(_mm_mullo_epi32(y, edge_),
_mm_mullo_epi32(z, corner_)));
_mm_storeu_si128((__m128i *)&dst[l], res);
}
// Handle any leftover pixels
for (; j < width - 1; ++j) {