Commit 26b6318d authored by Ronald S. Bultje
Browse files

Make get_coef_context() branchless.

This should significantly speed up cost_coeffs(). The patch does three
things. First, it pads the neighbour arrays by one item, which removes
the eob check in get_coef_context(). Second, it populates each col/row
scan and left/top edge coefficient with the same neighbour twice, which
removes the single/double context branch in get_coef_context(). Lastly,
it populates the neighbour arrays in pixel order (rather than scan
order), so we don't have to dereference the scantable to get the
correct neighbours.

Total encoding time of the first 50 frames of bus (speed 0) at 1500kbps
goes from 2min10.1s to 2min5.3s, i.e. a 2.6% overall speed increase.

Change-Id: I42bcd2210fd7bec03767ef0e2945a665b851df56
parent c8defcfd
......@@ -461,25 +461,25 @@ void vp9_default_coef_probs(VP9_COMMON *pc) {
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
......@@ -504,15 +504,17 @@ static int find_in_scan(const int16_t *scan, int l, int idx) {
}
static void init_scan_neighbors(const int16_t *scan,
int16_t *iscan,
int l, int16_t *neighbors,
int max_neighbors) {
int l, int16_t *neighbors) {
int l2 = l * l;
int n, i, j;
for (n = 0; n < l2; n++) {
// dc doesn't use this type of prediction
neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
iscan[0] = find_in_scan(scan, l, 0);
for (n = 1; n < l2; n++) {
int rc = scan[n];
iscan[n] = find_in_scan(scan, l, n);
assert(max_neighbors == MAX_NEIGHBORS);
i = rc / l;
j = rc % l;
if (i > 0 && j > 0) {
......@@ -524,93 +526,84 @@ static void init_scan_neighbors(const int16_t *scan,
// Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
// as a context. If ADST or DCT is used in both directions, we
// use the combination of the two as a context.
int a = find_in_scan(scan, l, (i - 1) * l + j);
int b = find_in_scan(scan, l, i * l + j - 1);
int a = (i - 1) * l + j;
int b = i * l + j - 1;
if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
scan == vp9_col_scan_16x16) {
neighbors[max_neighbors * n + 0] = a;
neighbors[max_neighbors * n + 1] = -1;
// in the col/row scan cases (as well as left/top edge cases), we set
// both contexts to the same value, so we can branchlessly do a+b+1>>1
// which automatically becomes a if a == b
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = a;
} else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
scan == vp9_row_scan_16x16) {
neighbors[max_neighbors * n + 0] = b;
neighbors[max_neighbors * n + 1] = -1;
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = b;
} else {
neighbors[max_neighbors * n + 0] = a;
neighbors[max_neighbors * n + 1] = b;
neighbors[MAX_NEIGHBORS * n + 0] = a;
neighbors[MAX_NEIGHBORS * n + 1] = b;
}
} else if (i > 0) {
neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);
neighbors[max_neighbors * n + 1] = -1;
} else if (j > 0) {
neighbors[max_neighbors * n + 0] =
find_in_scan(scan, l, i * l + j - 1);
neighbors[max_neighbors * n + 1] = -1;
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
} else {
assert(n == 0);
// dc predictor doesn't use previous tokens
neighbors[max_neighbors * n + 0] = -1;
assert(j > 0);
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
}
assert(neighbors[max_neighbors * n + 0] < n);
assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
}
// one padding item so we don't have to add branches in code to handle
// calls to get_coef_context() for the token after the final dc token
neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
}
void vp9_init_neighbors() {
init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
vp9_default_scan_4x4_neighbors);
init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
vp9_row_scan_4x4_neighbors);
init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
vp9_col_scan_4x4_neighbors);
init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
vp9_default_scan_8x8_neighbors);
init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
vp9_row_scan_8x8_neighbors);
init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
vp9_col_scan_8x8_neighbors);
init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
vp9_default_scan_16x16_neighbors);
init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
vp9_row_scan_16x16_neighbors);
init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
vp9_col_scan_16x16_neighbors);
init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
vp9_default_scan_32x32_neighbors);
}
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad) {
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
if (scan == vp9_default_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_4x4_neighbors;
} else if (scan == vp9_row_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_row_scan_4x4_neighbors;
} else if (scan == vp9_col_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_col_scan_4x4_neighbors;
} else if (scan == vp9_default_scan_8x8) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_8x8_neighbors;
} else if (scan == vp9_row_scan_8x8) {
*pad = 2;
return vp9_row_scan_8x8_neighbors;
} else if (scan == vp9_col_scan_8x8) {
*pad = 2;
return vp9_col_scan_8x8_neighbors;
} else if (scan == vp9_default_scan_16x16) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_16x16_neighbors;
} else if (scan == vp9_row_scan_16x16) {
*pad = 2;
return vp9_row_scan_16x16_neighbors;
} else if (scan == vp9_col_scan_16x16) {
*pad = 2;
return vp9_col_scan_16x16_neighbors;
} else if (scan == vp9_default_scan_32x32) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_32x32_neighbors;
} else {
assert(0);
return NULL;
assert(scan == vp9_default_scan_32x32);
return vp9_default_scan_32x32_neighbors;
}
}
......
......@@ -166,28 +166,14 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) {
}
#define MAX_NEIGHBORS 2
static INLINE int get_coef_context(const int16_t *scan,
const int16_t *neighbors,
int nb_pad, uint8_t *token_cache,
int c, int l) {
int eob = l;
assert(nb_pad == MAX_NEIGHBORS);
if (c == eob) {
return 0;
} else {
int ctx;
assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
} else {
ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
}
return ctx;
}
// Computes the entropy context for coefficient position c with no
// conditional branches. neighbors holds MAX_NEIGHBORS entries per position;
// the two entries for c are used directly as indices into token_cache, and
// the result is their average rounded up: (1 + a + b) >> 1. When both
// entries refer to the same neighbor, the result equals that neighbor's
// token_cache value.
static INLINE int get_coef_context(const int16_t *neighbors,
uint8_t *token_cache,
int c) {
return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad);
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
// 128 lists of probabilities are stored for the following ONE node probs:
......
......@@ -97,7 +97,7 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
TX_SIZE txfm_size, const int16_t *dq,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
ENTROPY_CONTEXT above_ec, left_ec;
int pt, c = 0, pad, default_eob;
int pt, c = 0;
int band;
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
......@@ -130,7 +130,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
scan = get_scan_4x4(tx_type);
above_ec = A[0] != 0;
left_ec = L[0] != 0;
default_eob = 16;
band_translate = vp9_coefband_trans_4x4;
break;
}
......@@ -140,7 +139,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
scan = get_scan_8x8(tx_type);
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
......@@ -150,7 +148,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
scan = get_scan_16x16(tx_type);
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
......@@ -158,13 +155,12 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
scan = vp9_default_scan_32x32;
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
default_eob = 1024;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
pt = combine_entropy_contexts(above_ec, left_ec);
nb = vp9_get_coef_neighbors_handle(scan, &pad);
nb = vp9_get_coef_neighbors_handle(scan);
while (1) {
int val;
......@@ -172,8 +168,7 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
if (c >= seg_eob)
break;
if (c)
pt = get_coef_context(scan, nb, pad, token_cache,
c, default_eob);
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
#if !CONFIG_BALANCED_COEFTREE
......@@ -186,8 +181,7 @@ SKIP_START:
if (c >= seg_eob)
break;
if (c)
pt = get_coef_context(scan, nb, pad, token_cache,
c, default_eob);
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
......
......@@ -112,11 +112,10 @@ static const int plane_rd_mult[4] = {
static int trellis_get_coeff_context(const int16_t *scan,
const int16_t *nb,
int idx, int token,
uint8_t *token_cache,
int pad, int l) {
uint8_t *token_cache) {
int bak = token_cache[scan[idx]], pt;
token_cache[scan[idx]] = vp9_pt_energy_class[token];
pt = get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
pt = get_coef_context(nb, token_cache, idx + 1);
token_cache[scan[idx]] = bak;
return pt;
}
......@@ -141,7 +140,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
int best, band, pt;
PLANE_TYPE type = xd->plane[plane].plane_type;
int err_mult = plane_rd_mult[type];
int default_eob, pad;
int default_eob;
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
......@@ -201,7 +200,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
nb = vp9_get_coef_neighbors_handle(scan, &pad);
nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
......@@ -220,8 +219,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
/* Consider both possible successor states. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
pad, default_eob);
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
mb->token_costs[tx_size][type][ref][0][band][pt]
[tokens[next][0].token];
......@@ -273,14 +271,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
if (t0 != DCT_EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
pad, default_eob);
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
[tokens[next][0].token];
}
if (t1 != DCT_EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
pad, default_eob);
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
[tokens[next][1].token];
}
......
......@@ -304,7 +304,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int pt;
int c = 0;
int cost = 0, pad;
int cost = 0;
const int16_t *scan, *nb;
const int eob = xd->plane[plane].eobs[block];
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
......@@ -314,7 +314,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
ENTROPY_CONTEXT above_ec, left_ec;
TX_TYPE tx_type = DCT_DCT;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
int seg_eob, default_eob;
int seg_eob;
uint8_t token_cache[1024];
const uint8_t * band_translate;
......@@ -372,8 +372,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
assert(eob <= seg_eob);
pt = combine_entropy_contexts(above_ec, left_ec);
nb = vp9_get_coef_neighbors_handle(scan, &pad);
default_eob = seg_eob;
nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
......@@ -402,7 +401,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
v = qcoeff_ptr[rc];
t = vp9_dct_value_tokens_ptr[v].token;
pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
pt = get_coef_context(nb, token_cache, c);
cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
token_cache[rc] = vp9_pt_energy_class[t];
prev_t = t;
......@@ -410,7 +409,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
// eob token
if (c < seg_eob) {
pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
pt = get_coef_context(nb, token_cache, c);
cost += token_costs[0][get_coef_band(band_translate, c)][pt]
[DCT_EOB_TOKEN];
}
......
......@@ -123,7 +123,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
const int loff = (off >> mod) << tx_size;
ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
int seg_eob, default_eob, pad;
int seg_eob;
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
vp9_coeff_count *counts;
......@@ -178,8 +178,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
}
pt = combine_entropy_contexts(above_ec, left_ec);
nb = vp9_get_coef_neighbors_handle(scan, &pad);
default_eob = seg_eob;
nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
......@@ -191,7 +190,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
int v = 0;
rc = scan[c];
if (c)
pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
pt = get_coef_context(nb, token_cache, c);
if (c < eob) {
v = qcoeff_ptr[rc];
assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment