Commit c5ddf06e authored by Alexander Bokov's avatar Alexander Bokov Committed by Yaowu Xu
Browse files

Add extra hashing mechanism on the TX size search level

In TX size search the rate and distortion values are often computed for
residuals that have already been seen before, with exactly the same
entropy context. Such repeated computations occur when the residual
signal is the same for two or more partitions for a given prediction mode.
By saving previous RD search results we can achieve some encoder
speed-up at the cost of higher RAM consumption (~20MB).

Test results with all default experiments:
Speedup      15% (measured on 4 lowres 30-frame sequences)
compression   0%

Change-Id: I8d9b8dd483cf862f0864b50ae80466beac978290
parent ad2c1782
......@@ -143,6 +143,28 @@ typedef struct {
CRC_CALCULATOR crc_calculator; // Hash function.
} TX_RD_RECORD;
// Cached rate/distortion result for one (transform block, tx_type) pair.
// Allows av1_tx_block_rd_b() to skip recomputing RD stats when the same
// residual is seen again under the same entropy context.
typedef struct {
int64_t dist;  // Saved distortion contribution added to rd_stats->dist.
int rate;  // Saved coefficient cost added to rd_stats->rate.
uint8_t skip;  // 1 iff the block coded to all-zero coefficients (eob == 0).
uint8_t entropy_context;  // Combined entropy context the entry was saved under.
uint8_t valid;  // Nonzero once the entry holds real data.
uint8_t fast;  // Value of the 'fast' search flag in effect when saving.
} TX_SIZE_RD_INFO;
// Number of distinct residual hashes remembered per TX block position.
#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
// Circular buffer of cached RD results keyed by residual hash. Each occupied
// slot holds one hash plus per-tx-type RD info; when full, the oldest slot is
// evicted (see find_tx_size_rd_info).
typedef struct {
uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];  // Residual hash per slot.
TX_SIZE_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN][TX_TYPES];  // RD info per slot, per tx_type.
int index_start;  // Index of the oldest occupied slot.
int num;  // Number of occupied slots (<= TX_SIZE_RD_RECORD_BUFFER_LEN).
} TX_SIZE_RD_RECORD;
// Node of the quadtree built by find_tx_size_rd_records() and walked during
// TX size search; children correspond to the four sub-blocks after a TX split.
typedef struct tx_size_rd_info_node {
TX_SIZE_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
struct tx_size_rd_info_node *children[4];  // Sub-block nodes; NULL when absent.
} TX_SIZE_RD_INFO_NODE;
typedef struct macroblock MACROBLOCK;
struct macroblock {
struct macroblock_plane plane[MAX_MB_PLANE];
......@@ -150,6 +172,18 @@ struct macroblock {
// Save the transform RD search info.
TX_RD_RECORD tx_rd_record;
// Also save RD info on the TX size search level for square TX sizes.
TX_SIZE_RD_RECORD
tx_size_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
TX_SIZE_RD_RECORD
tx_size_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
TX_SIZE_RD_RECORD
tx_size_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
#if CONFIG_TX64X64
TX_SIZE_RD_RECORD
tx_size_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
#endif
MACROBLOCKD e_mbd;
MB_MODE_INFO_EXT *mbmi_ext;
int skip_block;
......
......@@ -3358,6 +3358,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
x->tx_rd_record.num = x->tx_rd_record.index_start = 0;
av1_zero(x->tx_size_rd_record_8X8);
av1_zero(x->tx_size_rd_record_16X16);
av1_zero(x->tx_size_rd_record_32X32);
#if CONFIG_TX64X64
av1_zero(x->tx_size_rd_record_64X64);
#endif
av1_zero(x->pred_mv);
pc_root->index = 0;
......
......@@ -3616,7 +3616,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
int blk_row, int blk_col, int plane, int block,
int plane_bsize, const ENTROPY_CONTEXT *a,
const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, int fast) {
const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, int fast,
TX_SIZE_RD_INFO *rd_info_array) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
......@@ -3662,7 +3663,9 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
assert(tx_size < TX_SIZES_ALL);
int coeff_ctx = get_entropy_context(tx_size, a, l);
const int coeff_ctx = get_entropy_context(tx_size, a, l);
const int coeff_ctx_one_byte = combine_entropy_contexts(*a, *l);
const uint8_t cur_joint_ctx = (coeff_ctx << 2) + coeff_ctx_one_byte;
tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
plane_bsize, txm_bsize);
......@@ -3671,6 +3674,7 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
#endif // CONFIG_HIGHBITDEPTH
rd_stats->sse += tmp << 4;
if (rd_stats->invalid_rate) {
......@@ -3680,6 +3684,21 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
return;
}
// Look up RD and terminate early in case when we've already processed exactly
// the same residual with exactly the same entropy context.
if (rd_info_array != NULL && rd_info_array[tx_type].valid &&
rd_info_array[tx_type].entropy_context == cur_joint_ctx &&
rd_info_array[tx_type].fast == fast) {
rd_stats->dist += rd_info_array[tx_type].dist;
rd_stats->rate += rd_info_array[tx_type].rate;
rd_stats->skip &= rd_info_array[tx_type].skip;
return;
}
int64_t cur_dist = 0;
int cur_rate = 0;
uint8_t cur_skip = 1;
// TODO(any): Use av1_dist_block to compute distortion
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
......@@ -3793,11 +3812,25 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE,
blk_row, blk_col, plane_bsize, txm_bsize);
}
rd_stats->dist += tmp * 16;
cur_dist = tmp * 16;
txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block,
tx_size, scan_order, a, l, 0);
rd_stats->rate += txb_coeff_cost;
rd_stats->skip &= (eob == 0);
cur_rate = txb_coeff_cost;
cur_skip = (eob == 0);
// Save RD results for possible reuse in future.
if (rd_info_array != NULL) {
rd_info_array[tx_type].valid = 1;
rd_info_array[tx_type].entropy_context = cur_joint_ctx;
rd_info_array[tx_type].fast = fast;
rd_info_array[tx_type].dist = cur_dist;
rd_info_array[tx_type].rate = cur_rate;
rd_info_array[tx_type].skip = cur_skip;
}
rd_stats->dist += cur_dist;
rd_stats->rate += cur_rate;
rd_stats->skip &= cur_skip;
#if CONFIG_RD_DEBUG
av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
......@@ -3812,7 +3845,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
RD_STATS *rd_stats, int64_t ref_best_rd,
int *is_cost_valid, int fast,
int tx_split_prune_flag) {
int tx_split_prune_flag,
TX_SIZE_RD_INFO_NODE *rd_info_node) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
......@@ -3870,8 +3904,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_stats->zero_rate = zero_blk_rate;
if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
inter_tx_size[0][0] = tx_size;
av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
plane_bsize, pta, ptl, rd_stats, fast);
av1_tx_block_rd_b(
cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, pta, ptl,
rd_stats, fast,
rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
if (rd_stats->rate == INT_MAX) return;
if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
......@@ -3932,17 +3968,19 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
ref_best_rd = AOMMIN(this_rd, ref_best_rd);
int blk_idx = 0;
for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) {
int offsetr = blk_row + r;
int offsetc = blk_col + c;
for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
const int offsetr = blk_row + r;
const int offsetc = blk_col + c;
if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
assert(blk_idx < 4);
select_tx_block(
cpi, x, offsetr, offsetc, plane, block, sub_txs, depth + 1,
plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
ref_best_rd - tmp_rd, &this_cost_valid, fast, 0,
(rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs,
depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
&this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
fast, 0);
#if CONFIG_DIST_8X8
if (!x->using_dist_8x8)
#endif
......@@ -4100,7 +4138,8 @@ static int get_search_init_depth(int mi_width, int mi_height,
static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
int64_t ref_best_rd, int fast,
int tx_split_prune_flag) {
int tx_split_prune_flag,
TX_SIZE_RD_INFO_NODE *rd_info_tree) {
MACROBLOCKD *const xd = &x->e_mbd;
int is_cost_valid = 1;
int64_t this_rd = 0;
......@@ -4138,7 +4177,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth,
plane_bsize, ctxa, ctxl, tx_above, tx_left,
&pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid,
fast, tx_split_prune_flag);
fast, tx_split_prune_flag, rd_info_tree);
if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
av1_invalid_rd_stats(rd_stats);
return;
......@@ -4147,6 +4186,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
RDCOST(x->rdmult, 0, pn_rd_stats.sse));
block += step;
if (rd_info_tree != NULL) rd_info_tree += 1;
}
}
}
......@@ -4165,7 +4205,8 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int64_t ref_best_rd, TX_TYPE tx_type,
int tx_split_prune_flag) {
int tx_split_prune_flag,
TX_SIZE_RD_INFO_NODE *rd_info_tree) {
const int fast = cpi->sf.tx_size_search_method > USE_FULL_RD;
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
......@@ -4191,7 +4232,7 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
mbmi->tx_type = tx_type;
select_inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, fast,
tx_split_prune_flag);
tx_split_prune_flag, rd_info_tree);
if (rd_stats->rate == INT_MAX) return INT64_MAX;
mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]);
......@@ -4301,7 +4342,7 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_stats->zero_rate = zero_blk_rate;
rd_stats->ref_rdcost = ref_best_rd;
av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
plane_bsize, ta, tl, rd_stats, fast);
plane_bsize, ta, tl, rd_stats, fast, NULL);
const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
......@@ -4470,6 +4511,129 @@ static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info,
*rd_stats = tx_rd_info->rd_stats;
}
// Look up 'hash' in the record's circular buffer and return the index of the
// matching slot. If no slot matches, claim one for a fresh (zeroed) entry --
// growing the buffer while there is room, otherwise evicting the oldest
// entry -- and return that slot's index.
static int find_tx_size_rd_info(TX_SIZE_RD_RECORD *cur_record,
                                const uint32_t hash) {
  // Scan from the newest entry back to the oldest looking for the hash.
  for (int i = cur_record->num - 1; i >= 0; i--) {
    const int slot =
        (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
    if (cur_record->hash_vals[slot] == hash) return slot;
  }
  // Not found: pick the slot that will hold the new record.
  int slot;
  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
    // Buffer not yet full -- append after the current newest entry.
    slot = (cur_record->index_start + cur_record->num) %
           TX_SIZE_RD_RECORD_BUFFER_LEN;
    ++cur_record->num;
  } else {
    // Buffer full -- reuse the oldest slot and advance the start index.
    slot = cur_record->index_start;
    cur_record->index_start =
        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
  }
  cur_record->hash_vals[slot] = hash;
  av1_zero(cur_record->tx_rd_info[slot]);
  return slot;
}
// Go through all TX blocks that could be used in TX size search, compute
// residual hash values for them and find matching RD info that stores previous
// RD search results for these TX blocks. The idea is to prevent repeated
// rate/distortion computations that happen because of the combination of
// partition and TX size search. The resulting RD info records are returned in
// the form of a quadtree for easier access in actual TX size search.
// Returns 1 when dst_rd_info has been populated, 0 when hashing is skipped
// because the largest square TX size for this block is below 8x8.
// NOTE(review): dst_rd_info must have room for every node emitted across all
// depth levels (the caller allocates 16 + 64 + 256 entries) -- TODO confirm
// that bound covers all bsize/CONFIG combinations.
static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
int mi_col,
TX_SIZE_RD_INFO_NODE *dst_rd_info) {
// Per-TX-size record tables, indexed below by (cur_tx_size - TX_8X8).
#if CONFIG_TX64X64
TX_SIZE_RD_RECORD *rd_records_table[4] = { x->tx_size_rd_record_8X8,
x->tx_size_rd_record_16X16,
x->tx_size_rd_record_32X32,
x->tx_size_rd_record_64X64 };
#else
TX_SIZE_RD_RECORD *rd_records_table[3] = { x->tx_size_rd_record_8X8,
x->tx_size_rd_record_16X16,
x->tx_size_rd_record_32X32 };
#endif
const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
// Hashing is performed only for square TX sizes larger than TX_4X4
if (max_square_tx_size < TX_8X8) return 0;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int diff_stride = bw;
const struct macroblock_plane *const p = &x->plane[0];
const int16_t *diff = &p->src_diff[0];
// Coordinates of the top-left corner of current block within the superblock
// measured in pixels:
const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
int cur_rd_info_idx = 0;
int cur_tx_depth = 0;
// For each pixel of the block, index into dst_rd_info of the TX block that
// covered it at the previous (shallower) depth -- used to link children.
uint8_t parent_idx_buf[MAX_SB_SQUARE] = { 0 };
int cur_tx_size = max_txsize_rect_lookup[bsize];
// One pass per TX depth level, from the largest TX size downwards.
while (cur_tx_depth <= MAX_VARTX_DEPTH) {
const BLOCK_SIZE cur_tx_bsize = txsize_to_bsize[cur_tx_size];
const int cur_tx_bw = block_size_wide[cur_tx_bsize];
const int cur_tx_bh = block_size_high[cur_tx_bsize];
if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
for (int row = 0; row < bh; row += cur_tx_bh) {
for (int col = 0; col < bw; col += cur_tx_bw) {
if (cur_tx_bw != cur_tx_bh) {
// Use dummy nodes for all rectangular transforms within the
// TX size search tree.
dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
} else {
// Get spatial location of this TX block within the superblock
// (measured in cur_tx_bsize units).
const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
// Compute FNV-1a hash for this TX block.
// NOTE(review): residual samples are clipped to [0, 255] (offset by
// 128) before hashing, so out-of-range residuals can alias --
// presumably acceptable for a speed feature; confirm intent.
uint32_t hash = 2166136261;
for (int i = 0; i < cur_tx_bh; i++) {
const int16_t *cur_diff_row = diff + (row + i) * diff_stride + col;
for (int j = 0; j < cur_tx_bw; j++) {
hash = hash ^ clip_pixel(cur_diff_row[j] + 128);
hash *= 16777619;
}
}
// Find corresponding RD info based on the hash value.
const int rd_record_idx =
row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
col_in_sb;
int idx = find_tx_size_rd_info(
&rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
dst_rd_info[cur_rd_info_idx].rd_info_array =
rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
.tx_rd_info[idx];
}
// Update the output quadtree RD info structure.
av1_zero(dst_rd_info[cur_rd_info_idx].children);
if (cur_tx_depth > 0) {
// Attach this node as the correct child (NW/NE/SW/SE) of the parent
// block that covered the same pixels one depth level up.
const int y_odd = (row / cur_tx_bh) % 2;
const int x_odd = (col / cur_tx_bw) % 2;
const int child_idx = y_odd ? (x_odd ? 3 : 2) : (x_odd ? 1 : 0);
dst_rd_info[parent_idx_buf[row * bw + col]].children[child_idx] =
&dst_rd_info[cur_rd_info_idx];
}
// Record this node as the parent for the next depth level.
// NOTE(review): parent_idx_buf is uint8_t, so cur_rd_info_idx is
// assumed to stay below 256 at non-leaf depths -- TODO confirm.
for (int i = row; i < row + cur_tx_bh; ++i)
memset(parent_idx_buf + i * bw + col, cur_rd_info_idx, cur_tx_bw);
++cur_rd_info_idx;
}
}
cur_tx_size = sub_tx_size_map[cur_tx_size];
++cur_tx_depth;
}
return 1;
}
// Uses simple features on top of DCT coefficients to quickly predict
// whether optimal RD decision is to skip encoding the residual.
static int predict_skip_flag(const MACROBLOCK *x, BLOCK_SIZE bsize) {
......@@ -4638,6 +4802,15 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
return;
}
// Precompute residual hashes and find existing or add new RD records to
// store and reuse rate and distortion values to speed up TX size search.
TX_SIZE_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
int found_rd_info = 0;
if (ref_best_rd != INT64_MAX && within_border) {
found_rd_info =
find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
}
if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
!x->use_default_inter_tx_type && !xd->lossless[mbmi->segment_id]) {
prune = prune_tx(cpi, bsize, x, xd, tx_set_type,
......@@ -4691,7 +4864,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
if (tx_type != DCT_DCT) continue;
rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, mi_row, mi_col,
ref_best_rd, tx_type, tx_split_prune_flag);
ref_best_rd, tx_type, tx_split_prune_flag,
found_rd_info ? matched_rd_info : NULL);
// If the current tx_type is not included in the tx_set for the smallest
// tx size found, then all vartx partitions were actually transformed with
// DCT_DCT and we should avoid picking it.
......@@ -4773,7 +4947,7 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
ENTROPY_CONTEXT *ta = above_ctx + blk_col;
ENTROPY_CONTEXT *tl = left_ctx + blk_row;
av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
plane_bsize, ta, tl, rd_stats, fast);
plane_bsize, ta, tl, rd_stats, fast, NULL);
av1_set_txb_context(x, plane, block, tx_size, ta, tl);
} else {
const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment