Commit 1ddf2314 authored by Hui Su

Encoder speedup: buffer TX RD search results

The encoder may perform multiple rounds of transform RD search on the
same residue signal. Save the RD search results so that the encoder can
fetch them and terminate early when a residue signal has been seen
before.

The hash functions are ported from the hash_me experiment.

Test results show that encoder speed increases by 10% on average
(ranging from 0% to 30%) with all default experiments on.

Change-Id: I47dd63549f67cb43d3d700f6a76ce0992b1ccd0d
parent 46f30c7f
......@@ -123,6 +123,8 @@ set(AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/extend.h"
"${AOM_ROOT}/av1/encoder/firstpass.c"
"${AOM_ROOT}/av1/encoder/firstpass.h"
"${AOM_ROOT}/av1/encoder/hash.c"
"${AOM_ROOT}/av1/encoder/hash.h"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
"${AOM_ROOT}/av1/encoder/lookahead.c"
......
......@@ -107,6 +107,8 @@ AV1_CX_SRCS-yes += encoder/temporal_filter.c
AV1_CX_SRCS-yes += encoder/temporal_filter.h
AV1_CX_SRCS-yes += encoder/mbgraph.c
AV1_CX_SRCS-yes += encoder/mbgraph.h
AV1_CX_SRCS-yes += encoder/hash.c
AV1_CX_SRCS-yes += encoder/hash.h
ifeq ($(CONFIG_HASH_ME),yes)
AV1_CX_SRCS-yes += ../third_party/vector/vector.h
AV1_CX_SRCS-yes += ../third_party/vector/vector.c
......
......@@ -18,6 +18,7 @@
#include "av1/encoder/encint.h"
#endif
#include "av1/common/mvref_common.h"
#include "av1/encoder/hash.h"
#if CONFIG_DIST_8X8
#include "aom/aomcx.h"
#endif
......@@ -115,10 +116,36 @@ typedef struct {
float kmeans_data_buf[2 * MAX_SB_SQUARE];
} PALETTE_BUFFER;
// One saved result of a transform RD search. save_tx_rd_info() fills it from
// the current block's mode info and fetch_tx_rd_info() writes it back.
typedef struct {
// Transform type and size copied from the block's MB_MODE_INFO.
TX_TYPE tx_type;
TX_SIZE tx_size;
#if CONFIG_VAR_TX
TX_SIZE min_tx_size;
// Per-position transform sizes; only the first n8_h x n8_w entries are
// copied when the record is saved or fetched.
TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
// Copy of the leading entries of x->blk_skip[0].
uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
#endif // CONFIG_VAR_TX
#if CONFIG_TXK_SEL
TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
#endif // CONFIG_TXK_SEL
// RD statistics produced by the search that created this record.
RD_STATS rd_stats;
// Residue hash used as the lookup key for this record.
uint32_t hash_value;
} TX_RD_INFO;
// Number of transform RD search results kept in a TX_RD_RECORD.
#define RD_RECORD_BUFFER_LEN 8
// Small history of transform RD search results, looked up by residue hash.
typedef struct {
TX_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer.
// Index of the oldest valid entry; it is the slot that gets overwritten once
// the buffer is full.
int index_start;
// Number of valid entries, at most RD_RECORD_BUFFER_LEN.
int num;
CRC_CALCULATOR crc_calculator; // Hash function.
} TX_RD_RECORD;
typedef struct macroblock MACROBLOCK;
struct macroblock {
struct macroblock_plane plane[MAX_MB_PLANE];
// Save the transform RD search info.
TX_RD_RECORD tx_rd_record;
MACROBLOCKD e_mbd;
MB_MODE_INFO_EXT *mbmi_ext;
int skip_block;
......
......@@ -4654,6 +4654,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
x->tx_rd_record.num = x->tx_rd_record.index_start = 0;
av1_zero(x->pred_mv);
pc_root->index = 0;
......@@ -5024,6 +5025,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
av1_fill_coeff_costs(&td->mb, td->mb.e_mbd.tile_ctx);
#endif
av1_crc_calculator_init(&td->mb.tx_rd_record.crc_calculator, 24, 0x5D6DCB);
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
mi_row += cm->mib_size) {
encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "av1/encoder/hash.h"
// Feeds dataLength bytes from pData into the running CRC remainder, one byte
// at a time, using the 256-entry lookup table built at init time.
static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
                                        uint8_t *pData, uint32_t dataLength) {
  // Shift that moves the top byte of a bits-wide remainder down to bits 0..7.
  const uint32_t top_byte_shift = p_crc_calculator->bits - 8;
  uint32_t consumed = 0;
  while (consumed < dataLength) {
    // Only the low 8 bits of the XOR are kept as the table index.
    const uint8_t table_idx = (uint8_t)(
        (p_crc_calculator->remainder >> top_byte_shift) ^ pData[consumed]);
    p_crc_calculator->remainder =
        (p_crc_calculator->remainder << 8) ^ p_crc_calculator->table[table_idx];
    ++consumed;
  }
}
// Clears the running remainder so that a new CRC computation can start.
// File-local helper: it is not declared in hash.h, so it is given internal
// linkage like the other crc_calculator_* helpers in this file.
static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
  p_crc_calculator->remainder = 0;
}
// Returns the CRC accumulated so far: the running remainder ANDed with the
// result mask that was set up when the calculator was initialized.
static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
  const uint32_t mask = p_crc_calculator->final_result_mask;
  const uint32_t crc = p_crc_calculator->remainder & mask;
  return crc;
}
// Builds the 256-entry lookup table used by crc_calculator_process_data().
// For every possible byte value the bits are processed from the most
// significant to the least significant one; whenever the top bit of the
// bits-wide remainder is set, the remainder is shifted and XORed with
// trunc_poly, otherwise it is only shifted.
// Requires p_crc_calculator->bits and ->trunc_poly to be set beforehand.
static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
  // Shift in uint32_t: a plain `1 << 31` would overflow a signed int, which is
  // undefined behavior for a 32-bit CRC.
  const uint32_t high_bit = (uint32_t)1 << (p_crc_calculator->bits - 1);
  const uint32_t byte_high_bit = (uint32_t)1 << (8 - 1);
  for (uint32_t value = 0; value < 256; value++) {
    uint32_t remainder = 0;
    for (uint32_t mask = byte_high_bit; mask != 0; mask >>= 1) {
      if (value & mask) {
        remainder ^= high_bit;
      }
      // Remember the top bit before it is shifted out.
      const int feedback = (remainder & high_bit) != 0;
      remainder <<= 1;
      if (feedback) {
        remainder ^= p_crc_calculator->trunc_poly;
      }
    }
    p_crc_calculator->table[value] = remainder;
  }
}
// Initializes a CRC calculator.
//   p_crc_calculator: calculator to set up; every field is overwritten.
//   bits: width of the CRC in bits. The helpers in this file shift by
//         (bits - 8) and (bits - 1), so the value is expected to be in
//         [8, 32].
//   truncPoly: polynomial XORed into the remainder while the lookup table is
//              built.
void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
                             uint32_t truncPoly) {
  p_crc_calculator->remainder = 0;
  p_crc_calculator->bits = bits;
  p_crc_calculator->trunc_poly = truncPoly;
  // Build the mask in uint32_t. `(1 << bits) - 1` on a signed int is undefined
  // for bits == 31 (overflow) and bits == 32 (shift by the type width).
  p_crc_calculator->final_result_mask =
      (bits >= 32) ? 0xFFFFFFFFu : (((uint32_t)1 << bits) - 1);
  crc_calculator_init_table(p_crc_calculator);
}
// Computes the CRC of the `length` bytes starting at `p`. The calculator's
// running remainder is reset first, so the result does not depend on earlier
// calls; the calculator must have been initialized with
// av1_crc_calculator_init().
uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
                           int length) {
  // The helper takes an unsigned byte count.
  const uint32_t num_bytes = (uint32_t)length;
  crc_calculator_reset(p_crc_calculator);
  crc_calculator_process_data(p_crc_calculator, p, num_bytes);
  const uint32_t crc = crc_calculator_get_crc(p_crc_calculator);
  return crc;
}
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AV1_ENCODER_HASH_H_
#define AV1_ENCODER_HASH_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// State of a table-driven CRC calculator.
typedef struct _crc_calculator {
// Running remainder; cleared at the start of every av1_get_crc_value() call.
uint32_t remainder;
// Polynomial XORed into the remainder while the lookup table is built.
uint32_t trunc_poly;
// Width of the CRC in bits.
uint32_t bits;
// Lookup table with one precomputed remainder per possible byte value.
uint32_t table[256];
// Mask applied to the remainder to produce the returned CRC value.
uint32_t final_result_mask;
} CRC_CALCULATOR;
// Initialize the crc calculator. It must be executed at least once before
// calling av1_get_crc_value().
// `bits` is the CRC width in bits and `truncPoly` is the polynomial used to
// build the calculator's lookup table.
void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
uint32_t truncPoly);
// Returns the CRC of the `length` bytes starting at `p`. The calculator's
// running remainder is reset at the start of every call and is modified by the
// computation.
uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
int length);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AV1_ENCODER_HASH_H_
#include <assert.h>
#include "av1/encoder/hash.h"
#include "av1/encoder/hash_motion.h"
#include "./av1_rtcd.h"
// State of a table-driven CRC calculator.
typedef struct _crc_calculator {
// Running remainder; cleared by crc_calculator_reset().
uint32_t remainder;
// Polynomial XORed into the remainder while the lookup table is built.
uint32_t trunc_poly;
// Width of the CRC in bits.
uint32_t bits;
// Lookup table with one precomputed remainder per possible byte value.
uint32_t table[256];
// Mask applied to the remainder to produce the returned CRC value.
uint32_t final_result_mask;
} crc_calculator;
// Feeds dataLength bytes from pData into the running CRC remainder, one byte
// at a time, using the 256-entry lookup table built at init time.
static void crc_calculator_process_data(crc_calculator *p_crc_calculator,
                                        uint8_t *pData, uint32_t dataLength) {
  // Shift that moves the top byte of a bits-wide remainder down to bits 0..7.
  const uint32_t top_byte_shift = p_crc_calculator->bits - 8;
  uint32_t consumed = 0;
  while (consumed < dataLength) {
    // Only the low 8 bits of the XOR are kept as the table index.
    const uint8_t table_idx = (uint8_t)(
        (p_crc_calculator->remainder >> top_byte_shift) ^ pData[consumed]);
    p_crc_calculator->remainder =
        (p_crc_calculator->remainder << 8) ^ p_crc_calculator->table[table_idx];
    ++consumed;
  }
}
// Clears the running remainder so that a new CRC computation can start.
static void crc_calculator_reset(crc_calculator *p_crc_calculator) {
  p_crc_calculator->remainder = 0u;
}
// Returns the CRC accumulated so far: the running remainder ANDed with the
// result mask that was set up when the calculator was initialized.
static uint32_t crc_calculator_get_crc(crc_calculator *p_crc_calculator) {
  const uint32_t mask = p_crc_calculator->final_result_mask;
  const uint32_t crc = p_crc_calculator->remainder & mask;
  return crc;
}
// Builds the 256-entry lookup table used by crc_calculator_process_data().
// For every possible byte value the bits are processed from the most
// significant to the least significant one; whenever the top bit of the
// bits-wide remainder is set, the remainder is shifted and XORed with
// trunc_poly, otherwise it is only shifted.
// Requires p_crc_calculator->bits and ->trunc_poly to be set beforehand.
static void crc_calculator_init_table(crc_calculator *p_crc_calculator) {
  // Shift in uint32_t: a plain `1 << 31` would overflow a signed int, which is
  // undefined behavior for a 32-bit CRC.
  const uint32_t high_bit = (uint32_t)1 << (p_crc_calculator->bits - 1);
  const uint32_t byte_high_bit = (uint32_t)1 << (8 - 1);
  for (uint32_t value = 0; value < 256; value++) {
    uint32_t remainder = 0;
    for (uint32_t mask = byte_high_bit; mask != 0; mask >>= 1) {
      if (value & mask) {
        remainder ^= high_bit;
      }
      // Remember the top bit before it is shifted out.
      const int feedback = (remainder & high_bit) != 0;
      remainder <<= 1;
      if (feedback) {
        remainder ^= p_crc_calculator->trunc_poly;
      }
    }
    p_crc_calculator->table[value] = remainder;
  }
}
// Initializes a CRC calculator.
//   p_crc_calculator: calculator to set up; every field is overwritten.
//   bits: width of the CRC in bits. The helpers in this file shift by
//         (bits - 8) and (bits - 1), so the value is expected to be in
//         [8, 32].
//   truncPoly: polynomial XORed into the remainder while the lookup table is
//              built.
static void crc_calculator_init(crc_calculator *p_crc_calculator, uint32_t bits,
                                uint32_t truncPoly) {
  p_crc_calculator->remainder = 0;
  p_crc_calculator->bits = bits;
  p_crc_calculator->trunc_poly = truncPoly;
  // Build the mask in uint32_t. `(1 << bits) - 1` on a signed int is undefined
  // for bits == 31 (overflow) and bits == 32 (shift by the type width).
  p_crc_calculator->final_result_mask =
      (bits >= 32) ? 0xFFFFFFFFu : (((uint32_t)1 << bits) - 1);
  crc_calculator_init_table(p_crc_calculator);
}
// NOTE(review): the uses of crc_bits and block_size_bits are not visible in
// this excerpt; confirm their meaning against the rest of the file.
static const int crc_bits = 16;
static const int block_size_bits = 2;
// File-scope CRC calculators. av1_hash_table_init() sets both up as 24-bit
// CRCs, with polynomial 0x5D6DCB for the first and 0x864CFB for the second.
static crc_calculator crc_calculator1;
static crc_calculator crc_calculator2;
static CRC_CALCULATOR crc_calculator1;
static CRC_CALCULATOR crc_calculator2;
// Set to 1 by av1_hash_table_init() once the calculators have been
// initialized, so that the setup runs only once.
static int g_crc_initialized = 0;
static void hash_table_clear_all(hash_table *p_hash_table) {
......@@ -81,18 +23,6 @@ static void hash_table_clear_all(hash_table *p_hash_table) {
}
}
// Computes the CRC of the `length` bytes at `p` with the first file-scope
// calculator. The calculator's remainder is reset before the data is fed in.
static uint32_t get_crc_value1(uint8_t *p, int length) {
  // The helper takes an unsigned byte count.
  const uint32_t num_bytes = (uint32_t)length;
  crc_calculator_reset(&crc_calculator1);
  crc_calculator_process_data(&crc_calculator1, p, num_bytes);
  const uint32_t crc = crc_calculator_get_crc(&crc_calculator1);
  return crc;
}
// Computes the CRC of the `length` bytes at `p` with the second file-scope
// calculator. The calculator's remainder is reset before the data is fed in.
static uint32_t get_crc_value2(uint8_t *p, int length) {
  // The helper takes an unsigned byte count.
  const uint32_t num_bytes = (uint32_t)length;
  crc_calculator_reset(&crc_calculator2);
  crc_calculator_process_data(&crc_calculator2, p, num_bytes);
  const uint32_t crc = crc_calculator_get_crc(&crc_calculator2);
  return crc;
}
// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
// If yes, fix this function
static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
......@@ -138,8 +68,8 @@ static int hash_block_size_to_index(int block_size) {
void av1_hash_table_init(hash_table *p_hash_table) {
if (g_crc_initialized == 0) {
crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
g_crc_initialized = 1;
}
p_hash_table->p_lookup_table = NULL;
......@@ -226,8 +156,10 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
pic_block_hash[0][pos] = get_crc_value1(p, length * sizeof(p[0]));
pic_block_hash[1][pos] = get_crc_value2(p, length * sizeof(p[0]));
pic_block_hash[0][pos] =
av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
pic_block_hash[1][pos] =
av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
pos++;
}
......@@ -258,13 +190,15 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
p[1] = src_pic_block_hash[0][pos + src_size];
p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
dst_pic_block_hash[0][pos] = get_crc_value1((uint8_t *)p, length);
dst_pic_block_hash[0][pos] =
av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length);
p[0] = src_pic_block_hash[1][pos];
p[1] = src_pic_block_hash[1][pos + src_size];
p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
dst_pic_block_hash[1][pos] = get_crc_value2((uint8_t *)p, length);
dst_pic_block_hash[1][pos] =
av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length);
dst_pic_block_same_info[0][pos] =
src_pic_block_same_info[0][pos] &&
......@@ -391,10 +325,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
stride, pixel_to_hash);
hash_value_buffer[0][0][pos] =
get_crc_value1(pixel_to_hash, sizeof(pixel_to_hash));
hash_value_buffer[1][0][pos] =
get_crc_value2(pixel_to_hash, sizeof(pixel_to_hash));
hash_value_buffer[0][0][pos] = av1_get_crc_value(
&crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
hash_value_buffer[1][0][pos] = av1_get_crc_value(
&crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
}
}
......@@ -421,8 +355,8 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
to_hash[3] =
hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1];
hash_value_buffer[0][dst_idx][dst_pos] =
get_crc_value1((uint8_t *)to_hash, sizeof(to_hash));
hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
&crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
to_hash[0] = hash_value_buffer[1][src_idx][srcPos];
to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1];
......@@ -430,8 +364,8 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
to_hash[3] =
hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1];
hash_value_buffer[1][dst_idx][dst_pos] =
get_crc_value2((uint8_t *)to_hash, sizeof(to_hash));
hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
&crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
dst_pos++;
}
}
......
......@@ -5148,6 +5148,67 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
return rd;
}
// Computes the lookup key for the plane-0 residue of the current block. Every
// residual sample is offset by 128 and passed through clip_pixel() to get one
// byte, the bytes are hashed with the macroblock's CRC calculator, and the
// block size is added to the CRC shifted left by 7 bits.
static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
  const int num_samples = block_size_high[bsize] * block_size_wide[bsize];
  const int16_t *const residue = x->plane[0].src_diff;
  uint8_t hash_data[MAX_SB_SQUARE];
  // The residue is read with a stride equal to the block width, so the block
  // occupies num_samples consecutive entries.
  for (int i = 0; i < num_samples; ++i) {
    hash_data[i] = clip_pixel(residue[i] + 128);
  }
  const uint32_t crc = av1_get_crc_value(&x->tx_rd_record.crc_calculator,
                                         hash_data, num_samples);
  return (crc << 7) + bsize;
}
// Stores the transform decisions currently held in the block's mode info,
// together with the given residue hash and RD stats, into *tx_rd_info.
//   n4: number of x->blk_skip[0] entries to copy (CONFIG_VAR_TX only).
//   hash: residue hash that identifies this record.
//   rd_stats: RD statistics to keep with the record.
static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
                            const RD_STATS *const rd_stats,
                            TX_RD_INFO *const tx_rd_info) {
  const MACROBLOCKD *const xd = &x->e_mbd;
  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;

  tx_rd_info->hash_value = hash;
  tx_rd_info->tx_size = mbmi->tx_size;
  tx_rd_info->tx_type = mbmi->tx_type;
#if CONFIG_VAR_TX
  tx_rd_info->min_tx_size = mbmi->min_tx_size;
  memcpy(tx_rd_info->blk_skip, x->blk_skip[0],
         sizeof(tx_rd_info->blk_skip[0]) * n4);
  // Only the n8_h x n8_w region of the transform-size map is saved.
  for (int row = 0; row < xd->n8_h; ++row) {
    for (int col = 0; col < xd->n8_w; ++col) {
      tx_rd_info->inter_tx_size[row][col] = mbmi->inter_tx_size[row][col];
    }
  }
#endif  // CONFIG_VAR_TX
#if CONFIG_TXK_SEL
  av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
#endif  // CONFIG_TXK_SEL
  tx_rd_info->rd_stats = *rd_stats;
}
// Restores a saved transform RD search result: the transform decisions in
// *tx_rd_info are written back into the block's mode info (and x->blk_skip[0]
// under CONFIG_VAR_TX) and the saved RD stats are copied to *rd_stats.
//   n4: number of blk_skip entries to copy (CONFIG_VAR_TX only).
static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info,
                             RD_STATS *const rd_stats, MACROBLOCK *const x) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;

  mbmi->tx_type = tx_rd_info->tx_type;
  mbmi->tx_size = tx_rd_info->tx_size;
#if CONFIG_VAR_TX
  mbmi->min_tx_size = tx_rd_info->min_tx_size;
  memcpy(x->blk_skip[0], tx_rd_info->blk_skip,
         sizeof(tx_rd_info->blk_skip[0]) * n4);
  // Only the n8_h x n8_w region of the transform-size map is restored.
  for (int row = 0; row < xd->n8_h; ++row) {
    for (int col = 0; col < xd->n8_w; ++col) {
      mbmi->inter_tx_size[row][col] = tx_rd_info->inter_tx_size[row][col];
    }
  }
#endif  // CONFIG_VAR_TX
#if CONFIG_TXK_SEL
  av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
#endif  // CONFIG_TXK_SEL
  *rd_stats = tx_rd_info->rd_stats;
}
static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
......@@ -5196,6 +5257,22 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
for (idx = 0; idx < count32; ++idx)
av1_invalid_rd_stats(&rd_stats_stack[idx]);
const uint32_t hash = get_block_residue_hash(x, bsize);
TX_RD_RECORD *tx_rd_record = &x->tx_rd_record;
if (ref_best_rd != INT64_MAX) {
for (int i = 0; i < tx_rd_record->num; ++i) {
const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
// If there is a match in the tx_rd_record, fetch the RD decision and
// terminate early.
if (tx_rd_record->tx_rd_info[index].hash_value == hash) {
TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index];
fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
return;
}
}
}
for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
RD_STATS this_rd_stats;
av1_init_rd_stats(&this_rd_stats);
......@@ -5254,6 +5331,19 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
mbmi->tx_size = best_tx;
mbmi->min_tx_size = best_min_tx_size;
memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
// Save the RD search results into tx_rd_record.
int index;
if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
index =
(tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
++tx_rd_record->num;
} else {
index = tx_rd_record->index_start;
tx_rd_record->index_start =
(tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
}
save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]);
}
static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
......@@ -11586,6 +11676,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
#if CONFIG_VAR_TX
if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
assert(rd_stats_y.rate != INT_MAX);
} else {
int idx, idy;
super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment