Commit 93105538 authored by Debargha Mukherjee's avatar Debargha Mukherjee

Some optimizations on integer affine estimation

1. Adds a limit on number of candidate samples used for the
estimation.
2. Adds a limit on max mv magnitude for use in the least-squares
3. Makes some of the internal variables 32-bit.

Impact on coding efficiency in the noise range.

Change-Id: I8c1c3216368ceb2e3548660a3b8c159df54a8312
parent c20176e5
......@@ -46,6 +46,14 @@
(((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
: ROUND_POWER_OF_TWO((value), (n)))
/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */
#define ROUND_POWER_OF_TWO_64(value, n) \
(((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */
#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \
(((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
: ROUND_POWER_OF_TWO_64((value), (n)))
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
......
......@@ -1150,7 +1150,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int mvnumber = 0;
int global_offset_c = mi_col * MI_SIZE;
int global_offset_r = mi_row * MI_SIZE;
int samples_per_neighbor = 4;
// scan the above row
if (up_available) {
......@@ -1169,7 +1168,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
int j;
int pixelperblock = samples_per_neighbor;
int pixelperblock = SAMPLES_PER_NEIGHBOR;
mvasint[mvnumber] = mbmi->mv[0].as_int;
mvnumber++;
......@@ -1212,7 +1211,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
int j;
int pixelperblock = samples_per_neighbor;
int pixelperblock = SAMPLES_PER_NEIGHBOR;
mvasint[mvnumber] = mbmi->mv[0].as_int;
mvnumber++;
......@@ -1251,7 +1250,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
int j;
int pixelperblock = samples_per_neighbor;
int pixelperblock = SAMPLES_PER_NEIGHBOR;
mvasint[mvnumber] = mbmi->mv[0].as_int;
mvnumber++;
......@@ -1292,7 +1291,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int cr_offset = AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = AOMMAX(bw, MI_SIZE) / 2 - 1;
int j;
int pixelperblock = samples_per_neighbor;
int pixelperblock = SAMPLES_PER_NEIGHBOR;
for (j = 0; j < pixelperblock; j++) {
int r_offset = j / 2;
......
......@@ -1216,16 +1216,21 @@ void av1_warp_plane(WarpedMotionParams *wm,
#if CONFIG_WARPED_MOTION
#define IDET_PREC_BITS 48
#define LEAST_SQUARES_SAMPLES_MAX 32
#define LEAST_SQUARES_MV_MAX 1024 // max mv in 1/8-pel
#define IDET_WARPEDMODEL_DIFF_BITS (IDET_PREC_BITS - WARPEDMODEL_PREC_BITS)
static int find_affine_int(const int np, int *pts1, int *pts2,
WarpedMotionParams *wm, int mi_row, int mi_col) {
int64_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
int64_t C[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
int64_t Bx[3] = { 0, 0, 0 };
int64_t By[3] = { 0, 0, 0 };
int32_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
int32_t Bx[3] = { 0, 0, 0 };
int32_t By[3] = { 0, 0, 0 };
int i, j, n = 0, off;
int64_t C00, C01, C02, C11, C12, C22;
int64_t Px[3], Py[3];
int64_t Det, iDet;
int i, off;
int64_t Det, iDet, v;
// Offsets to make the values in the arrays smaller
const int ux = mi_col * MI_SIZE * 8, uy = mi_row * MI_SIZE * 8;
// Let source points (xi, yi) map to destimation points (xi', yi'),
......@@ -1245,79 +1250,83 @@ static int find_affine_int(const int np, int *pts1, int *pts2,
// y' = h3.x + h4.y + dy
//
// The loop below computes: A = P'P, Bx = P'q, By = P'r
for (i = 0; i < np; ++i) {
const int dx = *(pts2++) - ux;
const int dy = *(pts2++) - uy;
const int sx = *(pts1++) - ux;
const int sy = *(pts1++) - uy;
A[0][0] += sx * sx;
A[0][1] += sx * sy;
A[0][2] += sx;
A[1][1] += sy * sy;
A[1][2] += sy;
A[2][2] += 1;
Bx[0] += sx * dx;
Bx[1] += sy * dx;
Bx[2] += dx;
By[0] += sx * dy;
By[1] += sy * dy;
By[2] += dy;
// We need to just compute inv(A).Bx and inv(A).By for the solutions.
//
for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
i += SAMPLES_PER_NEIGHBOR) {
const int dx = pts2[i * 2] - ux;
const int dy = pts2[i * 2 + 1] - uy;
const int sx = pts1[i * 2] - ux;
const int sy = pts1[i * 2 + 1] - uy;
if (abs(sx - dx) >= LEAST_SQUARES_MV_MAX ||
abs(sy - dy) >= LEAST_SQUARES_MV_MAX)
continue;
A[0][0] += sx * sx;
A[0][1] += sx * sy;
A[0][2] += sx;
A[1][1] += sy * sy;
A[1][2] += sy;
A[2][2] += 1;
Bx[0] += sx * dx;
Bx[1] += sy * dx;
Bx[2] += dx;
By[0] += sx * dy;
By[1] += sy * dy;
By[2] += dy;
n++;
}
}
// Compute Cofactors of A
C[0][0] = A[1][1] * A[2][2] - A[1][2] * A[1][2];
C[0][1] = A[1][2] * A[0][2] - A[0][1] * A[2][2];
C[0][2] = A[0][1] * A[1][2] - A[0][2] * A[1][1];
C[1][1] = A[0][0] * A[2][2] - A[0][2] * A[0][2];
C[1][2] = A[0][1] * A[0][2] - A[0][0] * A[1][2];
C[2][2] = A[0][0] * A[1][1] - A[0][1] * A[0][1];
// Compute Determinant of A
Det = C[0][0] * A[0][0] + C[0][1] * A[0][1] + C[0][2] * A[0][2];
// These are the least squares solutions but need scaling.
Px[0] = C[0][0] * Bx[0] + C[0][1] * Bx[1] + C[0][2] * Bx[2];
Px[1] = C[0][1] * Bx[0] + C[1][1] * Bx[1] + C[1][2] * Bx[2];
Px[2] = C[0][2] * Bx[0] + C[1][2] * Bx[1] + C[2][2] * Bx[2];
Py[0] = C[0][0] * By[0] + C[0][1] * By[1] + C[0][2] * By[2];
Py[1] = C[0][1] * By[0] + C[1][1] * By[1] + C[1][2] * By[2];
Py[2] = C[0][2] * By[0] + C[1][2] * By[1] + C[2][2] * By[2];
C00 = (int64_t)A[1][1] * A[2][2] - (int64_t)A[1][2] * A[1][2];
C01 = (int64_t)A[1][2] * A[0][2] - (int64_t)A[0][1] * A[2][2];
C02 = (int64_t)A[0][1] * A[1][2] - (int64_t)A[0][2] * A[1][1];
C11 = (int64_t)A[0][0] * A[2][2] - (int64_t)A[0][2] * A[0][2];
C12 = (int64_t)A[0][1] * A[0][2] - (int64_t)A[0][0] * A[1][2];
C22 = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
// Scale by 1/16
Px[0] = ROUND_POWER_OF_TWO_SIGNED(Px[0], 4);
Px[1] = ROUND_POWER_OF_TWO_SIGNED(Px[1], 4);
Px[2] = ROUND_POWER_OF_TWO_SIGNED(Px[2], 4);
Py[0] = ROUND_POWER_OF_TWO_SIGNED(Py[0], 4);
Py[1] = ROUND_POWER_OF_TWO_SIGNED(Py[1], 4);
Py[2] = ROUND_POWER_OF_TWO_SIGNED(Py[2], 4);
Det = ROUND_POWER_OF_TWO_SIGNED(Det, 4);
C00 = ROUND_POWER_OF_TWO_SIGNED(C00, 6);
C01 = ROUND_POWER_OF_TWO_SIGNED(C01, 6);
C02 = ROUND_POWER_OF_TWO_SIGNED(C02, 6);
C11 = ROUND_POWER_OF_TWO_SIGNED(C11, 6);
C12 = ROUND_POWER_OF_TWO_SIGNED(C12, 6);
C22 = ROUND_POWER_OF_TWO_SIGNED(C22, 6);
// Compute Determinant of A
Det = C00 * A[0][0] + C01 * A[0][1] + C02 * A[0][2];
if (Det == 0) return 1;
// These divided by the Det, are the least squares solutions
Px[0] = C00 * Bx[0] + C01 * Bx[1] + C02 * Bx[2];
Px[1] = C01 * Bx[0] + C11 * Bx[1] + C12 * Bx[2];
Px[2] = C02 * Bx[0] + C12 * Bx[1] + C22 * Bx[2];
Py[0] = C00 * By[0] + C01 * By[1] + C02 * By[2];
Py[1] = C01 * By[0] + C11 * By[1] + C12 * By[2];
Py[2] = C02 * By[0] + C12 * By[1] + C22 * By[2];
// Compute inverse of the Determinant
// TODO(debargha, yuec): Try to remove this only division
// TODO(debargha, yuec): Try to remove this only division if possible
iDet = ((int64_t)1 << IDET_PREC_BITS) / Det;
wm->wmmat[2] = ((int64_t)Px[0] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
IDET_WARPEDMODEL_DIFF_BITS;
wm->wmmat[3] = ((int64_t)Px[1] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
IDET_WARPEDMODEL_DIFF_BITS;
wm->wmmat[0] = ((int64_t)Px[2] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS + 2))) >>
(IDET_WARPEDMODEL_DIFF_BITS + 3);
v = Px[0] * iDet;
wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
v = Px[1] * iDet;
wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
v = Px[2] * iDet;
wm->wmmat[0] =
ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
// Adjust x displacement for the offset
off = (ux << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[2] - uy * wm->wmmat[3];
wm->wmmat[0] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
wm->wmmat[4] = ((int64_t)Py[0] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
IDET_WARPEDMODEL_DIFF_BITS;
wm->wmmat[5] = ((int64_t)Py[1] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
IDET_WARPEDMODEL_DIFF_BITS;
wm->wmmat[1] = ((int64_t)Py[2] * (int64_t)iDet +
((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS + 2))) >>
(IDET_WARPEDMODEL_DIFF_BITS + 3);
v = Py[0] * iDet;
wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
v = Py[1] * iDet;
wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
v = Py[2] * iDet;
wm->wmmat[1] =
ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
// Adjust y displacement for the offset
off = (uy << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[4] - uy * wm->wmmat[5];
wm->wmmat[1] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment