Commit 58616eb0 authored by David Barker, committed by Debargha Mukherjee

Further speedups to warp filter

* Calculate sx4, sy4 by truncation instead of rounding (sketched below)
* Move some repeated calculations out of the filter loop (sketched below)

This is expected to have a roughly neutral effect on BDRATE.
The speedup of each filter (SSE2, lowbd SSSE3, highbd SSSE3) is
7-10%, for a total speedup of 14-18% when considered together
with patches f7a5ee53 and 14b8112b.

Change-Id: I692f649202214c7ab53ecf81f81386f1503e2d20
parent 6496fe97
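
Note on the first bullet: both the old and the new reduction force sx4/sy4 to a multiple of (1 << WARP_PARAM_REDUCE_BITS). The old form rounds to the nearest such multiple; the new form truncates towards minus infinity with a single AND. A minimal standalone sketch, not from the patch, with ROUND_POWER_OF_TWO(_SIGNED) paraphrased from aom_dsp/aom_dsp_common.h and WARP_PARAM_REDUCE_BITS assumed to be 6:

  #include <stdio.h>

  #define WARP_PARAM_REDUCE_BITS 6 /* assumed value, for illustration */

  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
  #define ROUND_POWER_OF_TWO_SIGNED(value, n)           \
    (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                   : ROUND_POWER_OF_TWO((value), (n)))

  /* Old reduction: round to the nearest multiple of 2^WARP_PARAM_REDUCE_BITS. */
  static int reduce_round(int x) {
    return ROUND_POWER_OF_TWO_SIGNED(x, WARP_PARAM_REDUCE_BITS) *
           (1 << WARP_PARAM_REDUCE_BITS);
  }

  /* New reduction: truncate (towards minus infinity for two's-complement
   * ints) with a single mask, as in the patch. */
  static int reduce_trunc(int x) {
    return x & ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  }

  int main(void) {
    const int samples[] = { 1024, 1055, 1056, -33, -1000 };
    for (int i = 0; i < 5; ++i)
      printf("x=%6d  round=%6d  trunc=%6d\n", samples[i],
             reduce_round(samples[i]), reduce_trunc(samples[i]));
    return 0;
  }

The two reductions pick either the same multiple or adjacent ones, so they never differ by more than one step of 1 << WARP_PARAM_REDUCE_BITS, which is consistent with the roughly neutral BDRATE effect expected above.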
@@ -1014,10 +1014,8 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       sx4 += alpha * (-4) + beta * (-4);
       sy4 += gamma * (-4) + delta * (-4);
 
-      sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-      sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
       for (k = -7; k < 8; ++k) {
@@ -1272,10 +1270,8 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
       sx4 += alpha * (-4) + beta * (-4);
       sy4 += gamma * (-4) + delta * (-4);
 
-      sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-      sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
       for (k = -7; k < 8; ++k) {
@@ -68,38 +68,49 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       iy4 = y4 >> WARPEDMODEL_PREC_BITS;
       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
+      // Add in all the constant terms, including rounding and offset
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
 
-      sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-      sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
-      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
-
-        // If the block is aligned such that, after clamping, every sample
-        // would be taken from the leftmost/rightmost column, then we can
-        // skip the expensive horizontal filter.
-        if (ix4 <= -7) {
+      // If the block is aligned such that, after clamping, every sample
+      // would be taken from the leftmost/rightmost column, then we can
+      // skip the expensive horizontal filter.
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else if (ix4 >= width + 6) {
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride + (width - 1)] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else {
-          int sx = sx4 + beta * (k + 4) +
-                   // Include rounding and offset here
-                   (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                   (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
 
           // Load source pixels
           const __m128i src =
@@ -203,8 +214,7 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
 
       // Vertical filter
       for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        int sy = sy4 + delta * (k + 4);
 
         // Load from tmp and rearrange pairs of consecutive rows into the
         // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
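Note on the second bullet: in the three SIMD hunks the per-row rounding and offset constant, (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS), is folded into sx4/sy4 once per 8x8 block instead of being re-added for every row of the horizontal and vertical filters. The fold commutes with the new truncation because the constant is itself a multiple of (1 << WARP_PARAM_REDUCE_BITS). A small standalone check of that claim, not part of the patch, with the constant values assumed from av1/common/warped_motion.h:

  #include <assert.h>

  #define WARPEDDIFF_PREC_BITS 10    /* assumed */
  #define WARPEDPIXEL_PREC_SHIFTS 64 /* assumed */
  #define WARP_PARAM_REDUCE_BITS 6   /* assumed */

  int main(void) {
    /* The rounding + offset term that used to be re-added on every row. */
    const int kconst = (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                       (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
    const int mask = ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

    /* kconst is a multiple of 1 << WARP_PARAM_REDUCE_BITS, so it does not
     * touch the bits that the truncation clears... */
    assert((kconst & ~mask) == 0);

    /* ...and therefore folding it in before the truncation (as the patch
     * does) equals adding it per row after the truncation. */
    for (int sx4 = -(1 << 16); sx4 < (1 << 16); ++sx4)
      assert(((sx4 + kconst) & mask) == ((sx4 & mask) + kconst));
    return 0;
  }
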
@@ -63,38 +63,49 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       iy4 = y4 >> WARPEDMODEL_PREC_BITS;
       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
+      // Add in all the constant terms, including rounding and offset
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
 
-      sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-      sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
-      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
-
-        // If the block is aligned such that, after clamping, every sample
-        // would be taken from the leftmost/rightmost column, then we can
-        // skip the expensive horizontal filter.
-        if (ix4 <= -7) {
+      // If the block is aligned such that, after clamping, every sample
+      // would be taken from the leftmost/rightmost column, then we can
+      // skip the expensive horizontal filter.
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else if (ix4 >= width + 6) {
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride + (width - 1)] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else {
-          int sx = sx4 + beta * (k + 4) +
-                   // Include rounding and offset here
-                   (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                   (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
 
           // Load source pixels
           const __m128i zero = _mm_setzero_si128();
@@ -198,8 +209,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
 
       // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        int sy = sy4 + delta * (k + 4);
 
         // Load from tmp and rearrange pairs of consecutive rows into the
         // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
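The restructuring of the horizontal filter in each SIMD function is loop unswitching: ix4 is constant across the k loop, so the left-border/right-border/general test is evaluated once per block rather than once per row, at the cost of duplicating the small row loop into each arm. A generic standalone sketch of the transformation; left_column(), right_column() and general_case() are hypothetical stand-ins for the three per-row paths, not functions from the codec:

  #include <stdio.h>

  static void left_column(int k)  { printf("left   row %d\n", k); }
  static void right_column(int k) { printf("right  row %d\n", k); }
  static void general_case(int k) { printf("filter row %d\n", k); }

  int main(void) {
    const int ix4 = -8, width = 16;

    /* Before: the loop-invariant boundary test ran once per row. */
    for (int k = -7; k < 8; ++k) {
      if (ix4 <= -7)
        left_column(k);
      else if (ix4 >= width + 6)
        right_column(k);
      else
        general_case(k);
    }

    /* After (loop unswitching): the test runs once per 8x8 block, and
     * the row loop is duplicated into each arm. */
    if (ix4 <= -7) {
      for (int k = -7; k < 8; ++k) left_column(k);
    } else if (ix4 >= width + 6) {
      for (int k = -7; k < 8; ++k) right_column(k);
    } else {
      for (int k = -7; k < 8; ++k) general_case(k);
    }
    return 0;
  }
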
@@ -250,38 +250,49 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       iy4 = y4 >> WARPEDMODEL_PREC_BITS;
       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
+      // Add in all the constant terms, including rounding and offset
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
 
-      sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-      sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
-      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
-
-        // If the block is aligned such that, after clamping, every sample
-        // would be taken from the leftmost/rightmost column, then we can
-        // skip the expensive horizontal filter.
-        if (ix4 <= -7) {
+      // If the block is aligned such that, after clamping, every sample
+      // would be taken from the leftmost/rightmost column, then we can
+      // skip the expensive horizontal filter.
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else if (ix4 >= width + 6) {
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
               ref[iy * stride + (width - 1)] *
               (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        } else {
-          int sx = sx4 + beta * (k + 4) +
-                   // Include rounding and offset here
-                   (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                   (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
 
           // Load source pixels
           const __m128i src =
@@ -375,8 +386,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
 
       // Vertical filter
       for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+        int sy = sy4 + delta * (k + 4);
 
         // Load from tmp and rearrange pairs of consecutive rows into the
         // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
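For context on the unchanged _mm_set1_epi16 shortcut visible in the horizontal-filter hunks above: when clamping makes all eight horizontal taps read the same edge pixel p, the filtered output collapses to p << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS), because each warp filter's taps sum to 1 << WARPEDPIXEL_FILTER_BITS. A standalone check of that identity, not part of the patch, with assumed constant values and a made-up tap set whose only relevant property is its sum:

  #include <assert.h>

  #define WARPEDPIXEL_FILTER_BITS 7   /* assumed: taps sum to 128 */
  #define HORSHEAR_REDUCE_PREC_BITS 5 /* assumed intermediate rounding */

  int main(void) {
    const int taps[8] = { -1, 3, -7, 127, 8, -4, 2, 0 }; /* sums to 128 */
    const int p = 123; /* the single clamped edge pixel */

    /* Full horizontal filter, with all eight taps reading the same pixel. */
    int acc = 0;
    for (int t = 0; t < 8; ++t) acc += taps[t] * p;
    const int filtered = (acc + (1 << (HORSHEAR_REDUCE_PREC_BITS - 1))) >>
                         HORSHEAR_REDUCE_PREC_BITS;

    /* The value the shortcut broadcasts with _mm_set1_epi16 instead. */
    assert(filtered ==
           p * (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
    return 0;
  }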