Commit 5ade4237 authored by Deb Mukherjee

Removes conditional statements from coefficient band lookup

Implements the scan-order-to-band map with arrays in both the encoder
and the decoder, so that looking up a band no longer needs a conditional.

Encoding seems to be about 1% faster at speed 0, tested on football.
Decoding seems to be about 0.5-1% faster on a set of 25 videos.

Change-Id: Idb233ca0b9e0efd790e30880642e8717e1c5c8dd
parent e5ed605f
......@@ -127,12 +127,6 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
// Maps a coefficient's scan position to its band.
// Positions above MAXBAND_INDEX are not looked up in the table; they all map
// to the last band (COEF_BANDS - 1). Every other position is read from
// band_translate.
static int get_coef_band(const uint8_t * band_translate, int coef_index) {
  if (coef_index > MAXBAND_INDEX) {
    return COEF_BANDS - 1;
  }
  return band_translate[coef_index];
}
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// Probabilities in between are interpolated linearly
......@@ -181,11 +175,6 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
return combine_entropy_contexts(above_ec, left_ec);
}
// Picks the band translation table for a transform size: the 4x4 table for
// TX_4X4, and the shared 8x8-and-larger table for every other size.
static const uint8_t *get_band_translate(TX_SIZE tx_size) {
  if (tx_size == TX_4X4) {
    return vp9_coefband_trans_4x4;
  }
  return vp9_coefband_trans_8x8plus;
}
static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx,
const int16_t **scan, const int16_t **scan_nb) {
......
......@@ -45,6 +45,7 @@ typedef struct TileWorkerData {
DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
const uint8_t *band_translate[2];
} TileWorkerData;
static int read_be32(const uint8_t *p) {
......@@ -294,7 +295,8 @@ struct intra_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
vp9_reader *r;
unsigned char* token_cache;
uint8_t *token_cache;
const uint8_t *band_translate[2];
};
static void predict_and_reconstruct_intra_block(int plane, int block,
......@@ -303,6 +305,9 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
struct intra_args *const args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
const uint8_t *band_translate[2] = {
args->band_translate[0], args->band_translate[1]
};
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
......@@ -324,7 +329,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
if (!mi->mbmi.skip_coeff) {
vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
args->r, args->token_cache);
args->r, args->token_cache, band_translate);
inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
}
......@@ -334,7 +339,8 @@ struct inter_args {
MACROBLOCKD *xd;
vp9_reader *r;
int *eobtotal;
unsigned char* token_cache;
uint8_t *token_cache;
const uint8_t *band_translate[2];
};
static void reconstruct_inter_block(int plane, int block,
......@@ -343,10 +349,14 @@ static void reconstruct_inter_block(int plane, int block,
struct inter_args *args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
const uint8_t *band_translate[2] = {
args->band_translate[0], args->band_translate[1]
};
*args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
plane_bsize, tx_size,
args->r, args->token_cache);
args->r, args->token_cache,
band_translate);
inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
......@@ -398,7 +408,8 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize,
unsigned char *token_cache) {
uint8_t *token_cache,
const uint8_t *band_translate[2]) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
......@@ -420,7 +431,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
if (!is_inter_block(mbmi)) {
struct intra_args arg = { cm, xd, r, token_cache };
struct intra_args arg = {
cm, xd, r, token_cache, {band_translate[0], band_translate[1]}
};
foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
&arg);
} else {
......@@ -438,7 +451,10 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
// Reconstruction
if (!mbmi->skip_coeff) {
int eobtotal = 0;
struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
struct inter_args arg = {
cm, xd, r, &eobtotal, token_cache,
{band_translate[0], band_translate[1]}
};
foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!less8x8 && eobtotal == 0)
mbmi->skip_coeff = 1; // skip loopfilter
......@@ -478,7 +494,8 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize,
unsigned char *token_cache) {
uint8_t *token_cache,
const uint8_t *band_translate[2]) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
......@@ -489,33 +506,37 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
if (subsize < BLOCK_8X8) {
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
band_translate);
} else {
switch (partition) {
case PARTITION_NONE:
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
band_translate);
break;
case PARTITION_HORZ:
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
band_translate);
if (mi_row + hbs < cm->mi_rows)
decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
token_cache);
token_cache, band_translate);
break;
case PARTITION_VERT:
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
band_translate);
if (mi_col + hbs < cm->mi_cols)
decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
token_cache);
token_cache, band_translate);
break;
case PARTITION_SPLIT:
decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
token_cache);
token_cache, band_translate);
decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
token_cache);
token_cache, band_translate);
decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
token_cache);
token_cache, band_translate);
decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
token_cache);
token_cache, band_translate);
break;
default:
assert(!"Invalid partition type");
......@@ -798,9 +819,13 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
vp9_zero(xd->left_context);
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
mi_col += MI_BLOCK_SIZE) {
const uint8_t *band_translate[2] = {
vp9_coefband_trans_4x4, pbi->coefband_trans_8x8plus
};
decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
pbi->token_cache);
pbi->token_cache, band_translate);
}
if (pbi->do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
......@@ -948,7 +973,7 @@ static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
}
static int tile_worker_hook(void *arg1, void *arg2) {
TileWorkerData *tile_data = (TileWorkerData*)arg1;
TileWorkerData *const tile_data = (TileWorkerData*)arg1;
const TileInfo *const tile = (TileInfo*)arg2;
int mi_row, mi_col;
......@@ -960,7 +985,8 @@ static int tile_worker_hook(void *arg1, void *arg2) {
mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
tile_data->token_cache);
tile_data->token_cache,
tile_data->band_translate);
}
}
return !tile_data->xd.corrupted;
......@@ -1019,6 +1045,8 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
tile_data->cm = cm;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
tile_data->band_translate[0] = vp9_coefband_trans_4x4;
tile_data->band_translate[1] = pbi->coefband_trans_8x8plus;
vp9_tile_init(tile, tile_data->cm, 0, tile_col);
setup_token_decoder(data, data_end, size, &cm->error,
......@@ -1299,6 +1327,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
const int tile_cols = 1 << cm->log2_tile_cols;
YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
vpx_memset(pbi->coefband_trans_8x8plus,
(COEF_BANDS - 1),
sizeof(pbi->coefband_trans_8x8plus));
vpx_memcpy(pbi->coefband_trans_8x8plus,
vp9_coefband_trans_8x8plus,
sizeof(vp9_coefband_trans_8x8plus));
if (!first_partition_size) {
// showing a frame directly
*p_data_end = data + 1;
......
......@@ -93,7 +93,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
TX_SIZE tx_size, const int16_t *dq, int pt,
uint8_t *token_cache) {
uint8_t *token_cache,
const uint8_t *band_translate) {
const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
......@@ -108,31 +109,30 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
counts->eob_branch[tx_size][type][ref];
const int16_t *scan, *nb;
const uint8_t *const band_translate = get_band_translate(tx_size);
const uint8_t *cat6;
get_scan(xd, tx_size, type, block_idx, &scan, &nb);
while (1) {
while (c < seg_eob) {
int val;
const uint8_t *cat6 = cat6_prob;
if (c >= seg_eob)
break;
if (c)
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
band = *band_translate++;
prob = coef_probs[band][pt];
if (!cm->frame_parallel_decoding_mode)
++eob_branch_count[band][pt];
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
break;
goto DECODE_ZERO;
SKIP_START:
if (c >= seg_eob)
break;
if (c)
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
band = *band_translate++;
prob = coef_probs[band][pt];
DECODE_ZERO:
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
......@@ -200,6 +200,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5);
}
val = 0;
cat6 = cat6_prob;
while (*cat6) {
val = (val << 1) | vp9_read(r, *cat6++);
}
......@@ -218,7 +219,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, vp9_reader *r,
uint8_t *token_cache) {
uint8_t *token_cache,
const uint8_t *band_translate[2]) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
tx_size);
......@@ -229,7 +231,8 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
eob = decode_coefs(cm, xd, r, block,
pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
tx_size, pd->dequant, pt, token_cache);
tx_size, pd->dequant, pt, token_cache,
band_translate[tx_size != TX_4X4]);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
......
......@@ -18,6 +18,7 @@
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, vp9_reader *r,
uint8_t *token_cache);
uint8_t *token_cache,
const uint8_t *band_translate[2]);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
......@@ -54,7 +54,8 @@ typedef struct VP9Decompressor {
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
PARTITION_CONTEXT *above_seg_context;
DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
} VP9D_COMP;
#endif // VP9_DECODER_VP9_ONYXD_INT_H_
......@@ -184,6 +184,9 @@ struct macroblock {
BLOCK_SIZE sb64_partitioning;
void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
// band cache
DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
};
// TODO(jingning): the variables used here are little complicated. need further
......
......@@ -138,7 +138,9 @@ static void optimize_b(MACROBLOCK *mb,
uint8_t token_cache[1024];
const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
const int16_t *dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
const uint8_t *const band_translate = (tx_size == TX_4X4 ?
vp9_coefband_trans_4x4 :
mb->coefband_trans_8x8plus);
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
......@@ -179,7 +181,7 @@ static void optimize_b(MACROBLOCK *mb,
t0 = (vp9_dct_value_tokens_ptr + x)->token;
/* Consider both possible successor states. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
band = band_translate[i + 1];
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
mb->token_costs[tx_size][type][ref][band][0][pt]
......@@ -230,7 +232,7 @@ static void optimize_b(MACROBLOCK *mb,
t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
}
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
band = band_translate[i + 1];
if (t0 != DCT_EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
......@@ -264,7 +266,7 @@ static void optimize_b(MACROBLOCK *mb,
/* There's no choice to make for a zero coefficient, so we don't
* add a new trellis node, but we do need to update the costs.
*/
band = get_coef_band(band_translate, i + 1);
band = band_translate[i + 1];
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
/* Update the cost of each path if we're past the EOB token. */
......@@ -284,7 +286,7 @@ static void optimize_b(MACROBLOCK *mb,
}
/* Now pick the best path through the whole trellis. */
band = get_coef_band(band_translate, i + 1);
band = band_translate[i + 1];
pt = combine_entropy_contexts(*a, *l);
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
......
......@@ -1223,6 +1223,13 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->fixed_divide[0] = 0;
for (i = 1; i < 512; i++)
cpi->fixed_divide[i] = 0x80000 / i;
vpx_memset(cpi->mb.coefband_trans_8x8plus,
(COEF_BANDS-1),
sizeof(cpi->mb.coefband_trans_8x8plus));
vpx_memcpy(cpi->mb.coefband_trans_8x8plus,
vp9_coefband_trans_8x8plus,
sizeof(vp9_coefband_trans_8x8plus));
}
......
......@@ -115,7 +115,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
const uint8_t *const band_translate = get_band_translate(tx_size);
const uint8_t *const band_translate = (tx_size == TX_4X4 ?
vp9_coefband_trans_4x4 :
cpi->mb.coefband_trans_8x8plus);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
......@@ -127,7 +129,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
get_scan(xd, tx_size, type, block, &scan, &nb);
c = 0;
do {
const int band = get_coef_band(band_translate, c);
const int band = band_translate[c];
int token;
int v = 0;
rc = scan[c];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment