Commit 506eb723 authored by David Barker, committed by Debargha Mukherjee

Make encoder use vectorized self-guided filter

By rearranging the code in restoration.c, we can allow the
encoder to use the SSE4.1 version of the self-guided filter
while picking the loop-restoration filter.

This also helps us prepare for adding a highbitdepth SSE4.1
version of the self-guided filter.

No effect on encoder output, but gives an end-to-end speedup
of 1-2%.

Change-Id: Id17ba4a0963ddce9f70a7cae666e212e138d5f2c
parent cff43bb2
......@@ -782,6 +782,17 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
# Prototypes for the self-guided restoration filter. Everything in this block
# is emitted only when CONFIG_LOOP_RESTORATION is "yes".
# NOTE(review): add_proto/specialize are defined outside this view; presumably
# add_proto registers the prototype and specialize names the optimized
# variants available for it -- confirm against the rtcd generator.
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
# 8-bit entry point: uint8_t source (dat) and destination (dst) buffers,
# with an int32_t scratch buffer (tmpbuf).
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
# Variant taking uint8_t input (dgd) and an int32_t destination (dst), with an
# explicit radius r and eps instead of the eps/xqd pair above.
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration sse4_1/;
# High-bitdepth counterparts (uint16_t pixels), only when
# CONFIG_AOM_HIGHBITDEPTH is "yes". Their specialize lists name no
# instruction-set variant.
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration_highbd/;
}
}
1;
......@@ -611,9 +611,10 @@ const int32_t one_by_x[MAX_NELEM] = {
};
#endif // APPROXIMATE_SGR
void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
int height, int stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int8_t num[RESTORATION_TILEPELS_MAX];
......@@ -812,6 +813,20 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
#endif // APPROXIMATE_SGR
}
/* C implementation of the 8-bit self-guided restoration entry point.
 *
 * Copies the width x height block of 8-bit samples at `dgd` (row pitch
 * `stride`) into the int32_t buffer `dst` (row pitch `dst_stride`), widening
 * each sample, and then hands `dst` to av1_selfguided_restoration_internal()
 * together with bit_depth, r, eps and the scratch buffer `tmpbuf`.
 */
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
                                  int stride, int32_t *dst, int dst_stride,
                                  int bit_depth, int r, int eps,
                                  int32_t *tmpbuf) {
  int row;
  for (row = 0; row < height; ++row) {
    /* Offsets of the first sample of this row in each buffer. */
    const int src_base = row * stride;
    const int dst_base = row * dst_stride;
    int col;
    for (col = 0; col < width; ++col) {
      dst[dst_base + col] = dgd[src_base + col];
    }
  }
  /* From here on the data lives in dst, so dst_stride is the pitch to use. */
  av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
                                      r, eps, tmpbuf);
}
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
......@@ -822,16 +837,12 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
flt1[i * width + j] = dat[i * stride + j];
flt2[i * width + j] = dat[i * stride + j];
}
}
av1_selfguided_restoration(flt1, width, height, width, bit_depth,
sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration(flt2, width, height, width, bit_depth,
sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -1151,25 +1162,36 @@ static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
}
}
static void apply_selfguided_restoration_highbd(
uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
/* C implementation of the high-bitdepth self-guided restoration entry point.
 *
 * Copies the width x height block of 16-bit samples at `dgd` (row pitch
 * `stride`) into the int32_t buffer `dst` (row pitch `dst_stride`), widening
 * each sample, and then hands `dst` to av1_selfguided_restoration_internal()
 * together with bit_depth, r, eps and the scratch buffer `tmpbuf`.
 */
void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
                                         int stride, int32_t *dst,
                                         int dst_stride, int bit_depth, int r,
                                         int eps, int32_t *tmpbuf) {
  int row;
  for (row = 0; row < height; ++row) {
    /* Offsets of the first sample of this row in each buffer. */
    const int src_base = row * stride;
    const int dst_base = row * dst_stride;
    int col;
    for (col = 0; col < width; ++col) {
      dst[dst_base + col] = dgd[src_base + col];
    }
  }
  /* From here on the data lives in dst, so dst_stride is the pitch to use. */
  av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
                                      r, eps, tmpbuf);
}
void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint16_t *dst,
int dst_stride, int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
flt1[i * width + j] = dat[i * stride + j];
flt2[i * width + j] = dat[i * stride + j];
}
}
av1_selfguided_restoration(flt1, width, height, width, bit_depth,
sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration(flt2, width, height, width, bit_depth,
sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......
......@@ -249,8 +249,6 @@ int av1_alloc_restoration_struct(struct AV1Common *cm,
void av1_free_restoration_struct(RestorationInfo *rst_info);
void extend_frame(uint8_t *data, int width, int height, int stride);
void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int bit_depth, int r, int eps, int32_t *tmpbuf);
#if USE_DOMAINTXFMRF
void av1_domaintxfmrf_restoration(uint8_t *dgd, int width, int height,
int stride, int param, uint8_t *dst,
......
......@@ -286,35 +286,32 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j, ep, bestep = 0;
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
int exq[2];
#if CONFIG_AOM_HIGHBITDEPTH
if (bit_depth > 8) {
uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
flt1[i * width + j] = (int32_t)dat[i * dat_stride + j];
flt2[i * width + j] = (int32_t)dat[i * dat_stride + j];
}
}
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
width, bit_depth, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
width, bit_depth, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
} else {
uint8_t *dat = dat8;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int k = i * width + j;
const int l = i * dat_stride + j;
flt1[k] = (int32_t)dat[l];
flt2[k] = (int32_t)dat[l];
}
}
#endif
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
bit_depth, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
bit_depth, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
#if CONFIG_AOM_HIGHBITDEPTH
}
av1_selfguided_restoration(flt1, width, height, width, bit_depth,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(flt2, width, height, width, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
#endif
get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
bit_depth, flt1, width, flt2, width, exq);
encode_xq(exq, exqd);
......
......@@ -68,8 +68,8 @@ class AV1SelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration(input, w, h, w, 8, eps, xqd, output, w,
tmpbuf);
apply_selfguided_restoration_c(input, w, h, w, 8, eps, xqd, output, w,
tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment