Commit 625e50bd authored by Rupert Swarbrick

Get rid of the highbd versions of the SGR code

This doesn't have a big performance impact, and it's rather simpler
just having one version of everything.

Change-Id: I5fa5e7640a63d0ccb0c371f266c6eee99d9520f9
parent 7cf60961
......@@ -551,19 +551,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# LOOP_RESTORATION functions
# Prototypes for the self-guided restoration entry points. They are registered
# only when CONFIG_LOOP_RESTORATION is "yes", and each is specialized for
# sse4_1.
# NOTE(review): apply_selfguided_restoration and av1_selfguided_restoration are
# each declared twice below, once without and once with the trailing
# "int bit_depth, int highbd" parameters. This looks like the before/after
# lines of a diff shown without +/- markers -- confirm which one is current.
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int bit_depth, int highbd";
specialize qw/av1_selfguided_restoration sse4_1/;
# Separate uint16_t (*_highbd) prototypes, registered only when
# CONFIG_HIGHBITDEPTH is "yes". They take bit_depth as an explicit parameter.
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "const uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, const int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "const uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
}
}
# CONVOLVE_ROUND/COMPOUND_ROUND functions
......
......@@ -1085,49 +1085,68 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
}
}
// Copies the input region, extended by SGRPROJ_BORDER_VERT rows above/below and
// SGRPROJ_BORDER_HORZ columns left/right, into a local int32_t buffer and hands
// that buffer to av1_selfguided_restoration_internal(), which writes to dst.
// NOTE(review): two variants of this function are interleaved below (an
// 8-bit-only form reading `dgd`, and a form taking bit_depth/highbd and reading
// `dgd8`). This looks like a diff rendered without +/- markers -- confirm which
// lines are current before compiling.
void av1_selfguided_restoration_c(const uint8_t *dgd, int width, int height,
void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int stride, int32_t *dst, int dst_stride,
int r, int eps) {
int r, int eps, int bit_depth, int highbd) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
// Row pitch of the local buffer: the requested width plus a border on each side.
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
// dgd32 points past the top and left borders inside dgd32_, so negative row
// and column indices below address the border area.
int32_t *dgd32 =
dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
// highbd: the byte pointer is converted with CONVERT_TO_SHORTPTR and the
// samples are read as uint16_t.
if (highbd) {
const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
dgd32[i * dgd32_stride + j] = dgd16[i * stride + j];
}
}
} else {
// Otherwise the samples are read directly as uint8_t.
for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
dgd32[i * dgd32_stride + j] = dgd8[i * stride + j];
}
}
}
// The argument list appears twice: once with a literal bit depth of 8 and once
// forwarding the caller's bit_depth.
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
dst_stride, 8, r, eps);
dst_stride, bit_depth, r, eps);
}
// Runs av1_selfguided_restoration_c() twice (with the r1/e1 and r2/e2 entries
// of sgr_params[eps]) into two halves of tmpbuf, then combines both filter
// outputs with the source pixel using the weights decoded from xqd, rounds,
// clips and stores the result in the destination.
// NOTE(review): two variants of this function are interleaved below (an
// 8-bit-only form using `dat`/`dst`, and a form taking bit_depth/highbd and
// using `dat8`/`dst8`). This looks like a diff rendered without +/- markers --
// confirm which lines are current before compiling.
void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height,
void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
int stride, int eps, const int *xqd,
uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
uint8_t *dst8, int dst_stride,
int32_t *tmpbuf, int bit_depth,
int highbd) {
int xq[2];
// The two filter outputs live RESTORATION_TILEPELS_MAX elements apart in
// tmpbuf, so tmpbuf must hold at least twice that many int32_t values.
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
// The filter outputs are written with a row stride of `width`.
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1);
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2);
av1_selfguided_restoration_c(dat8, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1,
bit_depth, highbd);
av1_selfguided_restoration_c(dat8, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2,
bit_depth, highbd);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
// k indexes the filter outputs, l the source and m the destination.
const int k = i * width + j;
const int l = i * stride + j;
const int m = i * dst_stride + j;
const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[k] - u;
const int32_t f2 = (int32_t)flt2[k] - u;
uint8_t *dst8ij = dst8 + i * dst_stride + j;
const uint8_t *dat8ij = dat8 + i * stride + j;
// highbd reads the sample through CONVERT_TO_SHORTPTR as 16-bit; otherwise
// it is read as a byte.
const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
// u is the source sample scaled up by SGRPROJ_RST_BITS; f1/f2 are the filter
// outputs' differences from it.
const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
const int32_t f1 = flt1[k] - u;
const int32_t f2 = flt2[k] - u;
const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
// Round away the SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS of extra precision.
const int16_t w =
(int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
dst[m] = clip_pixel(w);
const uint16_t out = clip_pixel_highbd(w, bit_depth);
// Store as 16-bit through CONVERT_TO_SHORTPTR when highbd, else as a byte.
if (highbd)
*CONVERT_TO_SHORTPTR(dst8ij) = out;
else
*dst8ij = out;
}
}
}
......@@ -1144,7 +1163,7 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
int w = AOMMIN(procunit_width, stripe_width - j);
apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf);
dst + j, dst_stride, tmpbuf, bit_depth, 0);
}
}
......@@ -1173,57 +1192,6 @@ static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
}
}
// Widens a uint16_t input region into a local int32_t buffer and runs
// av1_selfguided_restoration_internal() on it.
//
// The copy covers the width x height region plus SGRPROJ_BORDER_VERT rows
// above/below and SGRPROJ_BORDER_HORZ columns left/right, so `dgd` must be
// readable at those negative and past-the-end offsets. bit_depth, r and eps are
// forwarded unchanged; the result is written to dst with stride dst_stride.
void av1_selfguided_restoration_highbd_c(const uint16_t *dgd, int width,
                                         int height, int stride, int32_t *dst,
                                         int dst_stride, int bit_depth, int r,
                                         int eps) {
  int32_t padded_storage[RESTORATION_PROC_UNIT_PELS];
  const int padded_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  // Origin of the un-bordered region inside padded_storage.
  int32_t *const padded = padded_storage +
                          padded_stride * SGRPROJ_BORDER_VERT +
                          SGRPROJ_BORDER_HORZ;

  const int row_end = height + SGRPROJ_BORDER_VERT;
  const int col_end = width + SGRPROJ_BORDER_HORZ;
  for (int row = -SGRPROJ_BORDER_VERT; row < row_end; ++row) {
    for (int col = -SGRPROJ_BORDER_HORZ; col < col_end; ++col) {
      padded[row * padded_stride + col] = dgd[row * stride + col];
    }
  }

  av1_selfguided_restoration_internal(padded, width, height, padded_stride,
                                      dst, dst_stride, bit_depth, r, eps);
}
// Applies the two-pass self-guided filter to a uint16_t region and writes the
// projected, clipped result to dst.
//
// tmpbuf is split into two filter-output planes that start
// RESTORATION_TILEPELS_MAX elements apart; both planes use a row stride of
// `width`. The passes use the (r1, e1) and (r2, e2) entries of sgr_params[eps],
// and the weights come from decode_xq(xqd, ...).
void apply_selfguided_restoration_highbd_c(const uint16_t *dat, int width,
                                           int height, int stride,
                                           int bit_depth, int eps,
                                           const int *xqd, uint16_t *dst,
                                           int dst_stride, int32_t *tmpbuf) {
  int32_t *const first_pass = tmpbuf;
  int32_t *const second_pass = first_pass + RESTORATION_TILEPELS_MAX;
  assert(width * height <= RESTORATION_TILEPELS_MAX);

  av1_selfguided_restoration_highbd_c(dat, width, height, stride, first_pass,
                                      width, bit_depth, sgr_params[eps].r1,
                                      sgr_params[eps].e1);
  av1_selfguided_restoration_highbd_c(dat, width, height, stride, second_pass,
                                      width, bit_depth, sgr_params[eps].r2,
                                      sgr_params[eps].e2);

  int weights[2];
  decode_xq(xqd, weights);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int flt_pos = row * width + col;
      const int src_pos = row * stride + col;
      const int dst_pos = row * dst_stride + col;

      // Source sample scaled up by SGRPROJ_RST_BITS.
      const int32_t scaled_src = ((int32_t)dat[src_pos] << SGRPROJ_RST_BITS);
      const int32_t delta1 = (int32_t)first_pass[flt_pos] - scaled_src;
      const int32_t delta2 = (int32_t)second_pass[flt_pos] - scaled_src;

      const int32_t projected = weights[0] * delta1 + weights[1] * delta2 +
                                (scaled_src << SGRPROJ_PRJ_BITS);
      // Round away the extra precision, then clip to the given bit depth.
      const int16_t rounded = (int16_t)ROUND_POWER_OF_TWO(
          projected, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      dst[dst_pos] = (uint16_t)clip_pixel_highbd(rounded, bit_depth);
    }
  }
}
static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width,
......@@ -1232,11 +1200,9 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
int32_t *tmpbuf, int bit_depth) {
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
const uint16_t *data_p = CONVERT_TO_SHORTPTR(src8) + j;
uint16_t *dst_p = CONVERT_TO_SHORTPTR(dst8) + j;
apply_selfguided_restoration_highbd(
data_p, w, stripe_height, src_stride, bit_depth, rui->sgrproj_info.ep,
rui->sgrproj_info.xqd, dst_p, dst_stride, tmpbuf);
apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
}
}
#endif // CONFIG_HIGHBITDEPTH
......
......@@ -282,9 +282,10 @@ static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
}
}
static void selfguided_restoration(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *dst, int dst_stride,
int r, int eps, int bit_depth, int highbd) {
void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
int height, int dgd_stride, int32_t *dst,
int dst_stride, int r, int eps,
int bit_depth, int highbd) {
DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
......@@ -340,42 +341,43 @@ static void selfguided_restoration(const uint8_t *dgd8, int width, int height,
height, highbd);
}
// 8-bit entry point: forwards every argument to selfguided_restoration() with
// the bit depth fixed at 8 and the high-bit-depth flag cleared.
void av1_selfguided_restoration_sse4_1(const uint8_t *dgd, int width,
                                       int height, int dgd_stride, int32_t *dst,
                                       int dst_stride, int r, int eps) {
  const int bit_depth = 8;
  const int highbd = 0;
  selfguided_restoration(dgd, width, height, dgd_stride, dst, dst_stride, r,
                         eps, bit_depth, highbd);
}
void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width,
void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
int height, int stride, int eps,
const int *xqd, uint8_t *dst,
int dst_stride, int32_t *tmpbuf) {
int xq[2];
const int *xqd, uint8_t *dst8,
int dst_stride, int32_t *tmpbuf,
int bit_depth, int highbd) {
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1);
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2);
av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1,
bit_depth, highbd);
av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2,
bit_depth, highbd);
int xq[2];
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
__m128i xq1 = _mm_set1_epi32(xq[1]);
for (i = 0; i < height; ++i) {
for (int i = 0; i < height; ++i) {
// Calculate output in batches of 8 pixels
for (j = 0; j < width; j += 8) {
for (int j = 0; j < width; j += 8) {
const int k = i * width + j;
const int l = i * stride + j;
const int m = i * dst_stride + j;
__m128i src =
_mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])),
SGRPROJ_RST_BITS);
const __m128i u_0 = _mm_cvtepu16_epi32(src);
const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
const uint8_t *dat8ij = dat8 + i * stride + j;
__m128i src;
if (highbd) {
src = xx_load_128(CONVERT_TO_SHORTPTR(dat8ij));
} else {
src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
}
const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
const __m128i u_0 = _mm_cvtepu16_epi32(u);
const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt2[k]), u_0);
......@@ -396,83 +398,18 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width,
const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
const __m128i tmp = _mm_packs_epi32(w_0, w_1);
const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
xx_storel_64(&dst[m], res);
if (highbd) {
// Pack into 16 bits and clamp to [0, 2^bit_depth)
const __m128i tmp = _mm_packus_epi32(w_0, w_1);
const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
const __m128i res = _mm_min_epi16(tmp, max);
xx_store_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
} else {
// Pack into 8 bits and clamp to [0, 256)
const __m128i tmp = _mm_packs_epi32(w_0, w_1);
const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
xx_storel_64(dst8 + m, res);
}
}
}
}
#if CONFIG_HIGHBITDEPTH
// 16-bit entry point: converts the uint16_t pointer with CONVERT_TO_BYTEPTR and
// forwards to selfguided_restoration() with the caller's bit_depth and the
// high-bit-depth flag set.
void av1_selfguided_restoration_highbd_sse4_1(const uint16_t *dgd, int width,
                                              int height, int dgd_stride,
                                              int32_t *dst, int dst_stride,
                                              int bit_depth, int r, int eps) {
  const uint8_t *const dgd8 = CONVERT_TO_BYTEPTR(dgd);
  const int highbd = 1;
  selfguided_restoration(dgd8, width, height, dgd_stride, dst, dst_stride, r,
                         eps, bit_depth, highbd);
}
// SSE4.1 version of the self-guided restoration projection for uint16_t
// samples.
//
// Two filter passes (the (r1, e1) and (r2, e2) entries of sgr_params[eps]) are
// written to two planes of tmpbuf that start RESTORATION_TILEPELS_MAX elements
// apart, each with a row stride of `width`. The planes are then combined with
// the source using the weights from decode_xq(xqd, ...), rounded, clamped
// against (1 << bit_depth) - 1 and stored to dst.
//
// Each inner iteration handles 8 pixels: it loads 8 samples from dat with
// _mm_load_si128 and stores 8 samples to dst with _mm_store_si128, both of
// which are the aligned variants. If width is not a multiple of 8, the last
// iteration of each row reads and writes past column width - 1.
void apply_selfguided_restoration_highbd_sse4_1(
    const uint16_t *dat, int width, int height, int stride, int bit_depth,
    int eps, const int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
  int32_t *const first_pass = tmpbuf;
  int32_t *const second_pass = first_pass + RESTORATION_TILEPELS_MAX;
  assert(width * height <= RESTORATION_TILEPELS_MAX);

  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride,
                                           first_pass, width, bit_depth,
                                           sgr_params[eps].r1,
                                           sgr_params[eps].e1);
  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride,
                                           second_pass, width, bit_depth,
                                           sgr_params[eps].r2,
                                           sgr_params[eps].e2);

  int weights[2];
  decode_xq(xqd, weights);
  __m128i weight1 = _mm_set1_epi32(weights[0]);
  __m128i weight2 = _mm_set1_epi32(weights[1]);

  for (int row = 0; row < height; ++row) {
    // Calculate output in batches of 8 pixels
    for (int col = 0; col < width; col += 8) {
      const int flt_pos = row * width + col;
      const int src_pos = row * stride + col;
      const int dst_pos = row * dst_stride + col;

      // Load 8 source samples and scale them up by SGRPROJ_RST_BITS.
      const __m128i pixels = _mm_load_si128((__m128i *)&dat[src_pos]);
      const __m128i scaled = _mm_slli_epi16(pixels, SGRPROJ_RST_BITS);

      // Zero-extend to 32 bits: lanes 0-3 in *_lo, lanes 4-7 in *_hi.
      const __m128i base_lo = _mm_cvtepu16_epi32(scaled);
      const __m128i base_hi = _mm_cvtepu16_epi32(_mm_srli_si128(scaled, 8));

      // Differences between each filter pass and the scaled source.
      const __m128i flt1_lo = _mm_loadu_si128((__m128i *)&first_pass[flt_pos]);
      const __m128i flt2_lo =
          _mm_loadu_si128((__m128i *)&second_pass[flt_pos]);
      const __m128i flt1_hi =
          _mm_loadu_si128((__m128i *)&first_pass[flt_pos + 4]);
      const __m128i flt2_hi =
          _mm_loadu_si128((__m128i *)&second_pass[flt_pos + 4]);
      const __m128i delta1_lo = _mm_sub_epi32(flt1_lo, base_lo);
      const __m128i delta2_lo = _mm_sub_epi32(flt2_lo, base_lo);
      const __m128i delta1_hi = _mm_sub_epi32(flt1_hi, base_hi);
      const __m128i delta2_hi = _mm_sub_epi32(flt2_hi, base_hi);

      // Weighted sum of the differences plus the source at projection scale.
      const __m128i weighted_lo =
          _mm_add_epi32(_mm_mullo_epi32(weight1, delta1_lo),
                        _mm_mullo_epi32(weight2, delta2_lo));
      const __m128i weighted_hi =
          _mm_add_epi32(_mm_mullo_epi32(weight1, delta1_hi),
                        _mm_mullo_epi32(weight2, delta2_hi));
      const __m128i sum_lo = _mm_add_epi32(
          weighted_lo, _mm_slli_epi32(base_lo, SGRPROJ_PRJ_BITS));
      const __m128i sum_hi = _mm_add_epi32(
          weighted_hi, _mm_slli_epi32(base_hi, SGRPROJ_PRJ_BITS));

      // Rounding shift back down by SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS.
      const __m128i round_bias =
          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m128i out_lo =
          _mm_srai_epi32(_mm_add_epi32(sum_lo, round_bias),
                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m128i out_hi =
          _mm_srai_epi32(_mm_add_epi32(sum_hi, round_bias),
                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);

      // Pack into 16 bits and clamp to [0, 2^bit_depth)
      const __m128i packed = _mm_packus_epi32(out_lo, out_hi);
      const __m128i pixel_max = _mm_set1_epi16((1 << bit_depth) - 1);
      const __m128i clamped = _mm_min_epi16(packed, pixel_max);
      _mm_store_si128((__m128i *)&dst[dst_pos], clamped);
    }
  }
}
#endif
......@@ -372,26 +372,10 @@ static void sgr_filter_block(const sgr_params_type *params, const uint8_t *dat8,
int width, int height, int dat_stride,
int use_highbd, int bit_depth, int32_t *flt1,
int32_t *flt2, int flt_stride) {
#if CONFIG_HIGHBITDEPTH
if (use_highbd) {
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
flt_stride, bit_depth, params->r1,
params->e1);
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
flt_stride, bit_depth, params->r2,
params->e2);
return;
}
#else
(void)use_highbd;
(void)bit_depth;
#endif // CONFIG_HIGHBITDEPTH
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, flt_stride,
params->r1, params->e1);
params->r1, params->e1, bit_depth, use_highbd);
av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, flt_stride,
params->r2, params->e2);
params->r2, params->e2, bit_depth, use_highbd);
}
// Apply the self-guided filter across an entire restoration unit.
......
......@@ -81,7 +81,7 @@ class AV1SelfguidedFilterTest
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf);
output_p, out_stride, tmpbuf, 8, 0);
}
}
std::clock_t end = std::clock();
......@@ -146,9 +146,9 @@ class AV1SelfguidedFilterTest
uint8_t *output_p = output + k * out_stride + j;
uint8_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf);
output_p, out_stride, tmpbuf, 8, 0);
apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
output2_p, out_stride, tmpbuf);
output2_p, out_stride, tmpbuf, 8, 0);
}
/*
apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
......@@ -234,9 +234,9 @@ class AV1HighbdSelfguidedFilterTest
int h = AOMMIN(pu_height, height - k);
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
eps, xqd, output_p, out_stride,
tmpbuf);
apply_selfguided_restoration(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
}
}
std::clock_t end = std::clock();
......@@ -302,12 +302,12 @@ class AV1HighbdSelfguidedFilterTest
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
uint16_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
eps, xqd, output_p, out_stride,
tmpbuf);
apply_selfguided_restoration_highbd_c(input_p, w, h, stride,
bit_depth, eps, xqd, output2_p,
out_stride, tmpbuf);
apply_selfguided_restoration(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
apply_selfguided_restoration_c(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
}
/*
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment