Commit c4b69eb0 authored by hui su's avatar hui su

Speed-up for ext-intra

-Avoid unnecessary calculations
-Use SIMD when possible

Encoder is about 5% faster with the extra intra prediction angles
enabled.

Change-Id: I131056befe327cedab217ad4a40d5f2a11318acc
parent 01073732
......@@ -220,7 +220,7 @@ const InterpKernel *vp10_filter_kernels[SWITCHABLE_FILTERS + 1] = {
#if CONFIG_EXT_INTRA
const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS] = {
NULL, // INTRA_FILTER_LINEAR
bilinear_filters, // INTRA_FILTER_LINEAR
sub_pel_filters_8, // INTRA_FILTER_8TAP
sub_pel_filters_8sharp, // INTRA_FILTER_8TAP_SHARP
sub_pel_filters_8smooth, // INTRA_FILTER_8TAP_SMOOTH
......
......@@ -276,28 +276,95 @@ static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
int r, c, x, y, base, shift, val;
int r, c, x, base, shift, val;
(void)left;
(void)dy;
assert(dy == 1);
assert(dx < 0);
for (r = 0; r < bs; ++r) {
y = r + 1;
for (c = 0; c < bs; ++c) {
x = (c << 8) - y * dx;
if (filter_type != INTRA_FILTER_LINEAR) {
const int pad_size = SUBPEL_TAPS >> 1;
int len;
DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][64]);
DECLARE_ALIGNED(16, uint8_t, src[64 + SUBPEL_TAPS]);
uint8_t flags[SUBPEL_SHIFTS];
memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
memset(src, above[0], pad_size * sizeof(above[0]));
memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
memset(src + pad_size + 2 * bs, above[2 * bs - 1],
pad_size * sizeof(above[0]));
flags[0] = 1;
x = -dx;
for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
base = x >> 8;
shift = x - (base << 8);
shift = x & 0xFF;
shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
if (shift == SUBPEL_SHIFTS) {
base += 1;
shift = 0;
}
len = VPXMIN(bs, 2 * bs - 1 - base);
if (len <= 0) {
int i;
for (i = r; i < bs; ++i) {
memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
dst += stride;
}
return;
}
if (len <= (bs >> 1) && !flags[shift]) {
base = x >> 8;
shift = x & 0xFF;
for (c = 0; c < len; ++c) {
val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
filter_type);
dst[c] = clip_pixel(val);
++base;
}
} else {
if (!flags[shift]) {
vpx_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
vp10_intra_filter_kernels[filter_type][shift], 16,
NULL, 16, 2 * bs, 2 * bs < 16 ? 2 : 1);
flags[shift] = 1;
}
memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
len * sizeof(dst[0]));
}
if (len < bs)
memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
}
return;
}
// For linear filter, C code is faster.
x = -dx;
for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
base = x >> 8;
shift = x & 0xFF;
if (base >= 2 * bs - 1) {
int i;
for (i = r; i < bs; ++i) {
memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
dst += stride;
}
return;
}
for (c = 0; c < bs; ++c, ++base) {
if (base < 2 * bs - 1) {
val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
filter_type);
val = above[base] * (256 - shift) + above[base + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 8);
dst[c] = clip_pixel(val);
} else {
dst[c] = above[2 * bs - 1];
}
}
dst += stride;
}
}
......@@ -305,33 +372,33 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
int r, c, x, y, shift, val, base;
int r, c, x, y, shift1, shift2, val, base1, base2, use_above;
assert(dx > 0);
assert(dy > 0);
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
y = r + 1;
x = (c << 8) - y * dx;
base = x >> 8;
if (base >= -1) {
shift = x - (base << 8);
val = intra_subpel_interp(base, shift, above, -1, bs - 1, filter_type);
x = -dx;
for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
use_above = 0;
base1 = x >> 8;
y = (r << 8) - dy;
for (c = 0; c < bs; ++c, ++base1, y -= dy) {
if (base1 >= -1) {
shift1 = x & 0xFF;
val = intra_subpel_interp(base1, shift1, above, -1, bs - 1,
filter_type);
} else {
x = c + 1;
y = (r << 8) - x * dy;
base = y >> 8;
if (base >= 0) {
shift = y - (base << 8);
val = intra_subpel_interp(base, shift, left, 0, bs - 1, filter_type);
base2 = y >> 8;
if (base2 >= 0) {
shift2 = y & 0xFF;
val = intra_subpel_interp(base2, shift2, left, 0, bs - 1,
filter_type);
} else {
val = left[0];
}
}
dst[c] = clip_pixel(val);
}
dst += stride;
}
}
......@@ -339,7 +406,7 @@ static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
int r, c, x, y, base, shift, val;
int r, c, y, base, shift, val;
(void)above;
(void)dx;
......@@ -347,21 +414,94 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
assert(dx == 1);
assert(dy < 0);
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
x = c + 1;
y = (r << 8) - x * dy;
if (filter_type != INTRA_FILTER_LINEAR) {
const int pad_size = SUBPEL_TAPS >> 1;
int len, i;
DECLARE_ALIGNED(16, uint8_t, buf[64][4 * SUBPEL_SHIFTS]);
DECLARE_ALIGNED(16, uint8_t, src[(64 + SUBPEL_TAPS) * 4]);
uint8_t flags[SUBPEL_SHIFTS];
memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
for (i = 0; i < pad_size; ++i)
src[4 * i] = left[0];
for (i = 0; i < 2 * bs; ++i)
src[4 * (i + pad_size)] = left[i];
for (i = 0; i < pad_size; ++i)
src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
flags[0] = 1;
y = -dy;
for (c = 0; c < bs; ++c, y -= dy) {
base = y >> 8;
shift = y - (base << 8);
shift = y & 0xFF;
shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
if (shift == SUBPEL_SHIFTS) {
base += 1;
shift = 0;
}
len = VPXMIN(bs, 2 * bs - 1 - base);
if (len <= 0) {
for (i = r; r < bs; ++r) {
dst[i * stride + c] = left[ 2 * bs - 1];
}
continue;
}
if (len <= (bs >> 1) && !flags[shift]) {
base = y >> 8;
shift = y & 0xFF;
for (r = 0; r < len; ++r) {
val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
filter_type);
dst[r * stride + c] = clip_pixel(val);
++base;
}
} else {
if (!flags[shift]) {
vpx_convolve8_vert(src + 4 * pad_size, 4,
buf[0] + 4 * shift, 4 * SUBPEL_SHIFTS, NULL, 16,
vp10_intra_filter_kernels[filter_type][shift], 16,
2 * bs < 16 ? 4 : 4, 2 * bs);
flags[shift] = 1;
}
if (shift == 0) {
for (r = 0; r < len; ++r) {
dst[r * stride + c] = left[r + base];
}
} else {
for (r = 0; r < len; ++r) {
dst[r * stride + c] = buf[r + base][4 * shift];
}
}
}
if (len < bs) {
for (r = len; r < bs; ++r) {
dst[r * stride + c] = left[ 2 * bs - 1];
}
}
}
return;
}
// For linear filter, C code is faster.
y = -dy;
for (c = 0; c < bs; ++c, y -= dy) {
base = y >> 8;
shift = y & 0xFF;
for (r = 0; r < bs; ++r, ++base) {
if (base < 2 * bs - 1) {
val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
filter_type);
dst[c] = clip_pixel(val);
val = left[base] * (256 - shift) + left[base + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 8);
dst[r * stride + c] = clip_pixel(val);
} else {
dst[c] = left[ 2 * bs - 1];
for (; r < bs; ++r)
dst[r * stride + c] = left[2 * bs - 1];
break;
}
}
dst += stride;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment