Commit d064cf03 authored by Jingning Han's avatar Jingning Han

Resolve extremely large stack alloc in rdopt

Move the large stack allocation from stack initialization to
dedicated mem space. This resolves the extremely large stack issue
when ext-partition, motion-var, and high bit-depth are all turned
on.

BUG=aomedia:415

Change-Id: I85b77bbc6429093fcb0152176d9e237087d6bbd8
parent c77799be
......@@ -147,6 +147,8 @@ struct macroblock {
#if CONFIG_MOTION_VAR
int32_t *wsrc_buf;
int32_t *mask_buf;
uint8_t *above_pred_buf;
uint8_t *left_pred_buf;
#endif // CONFIG_MOTION_VAR
#if CONFIG_PALETTE
......
......@@ -453,6 +453,20 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->active_map.map);
cpi->active_map.map = NULL;
#if CONFIG_MOTION_VAR
aom_free(cpi->td.mb.above_pred_buf);
cpi->td.mb.above_pred_buf = NULL;
aom_free(cpi->td.mb.left_pred_buf);
cpi->td.mb.left_pred_buf = NULL;
aom_free(cpi->td.mb.wsrc_buf);
cpi->td.mb.wsrc_buf = NULL;
aom_free(cpi->td.mb.mask_buf);
cpi->td.mb.mask_buf = NULL;
#endif
// Free up-sampled reference buffers.
for (i = 0; i < (REF_FRAMES + 1); i++)
aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
......@@ -2239,6 +2253,31 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
}
#endif
#if CONFIG_MOTION_VAR
#if CONFIG_HIGHBITDEPTH
int buf_scaler = 2;
#else
int buf_scaler = 1;
#endif
CHECK_MEM_ERROR(
cm, cpi->td.mb.above_pred_buf,
(uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
sizeof(*cpi->td.mb.above_pred_buf)));
CHECK_MEM_ERROR(
cm, cpi->td.mb.left_pred_buf,
(uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
sizeof(*cpi->td.mb.left_pred_buf)));
CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
#endif
init_upsampled_ref_frame_bufs(cpi);
av1_set_speed_features_framesize_independent(cpi);
......@@ -2530,6 +2569,12 @@ void av1_remove_compressor(AV1_COMP *cpi) {
if (cpi->common.allow_screen_content_tools)
aom_free(thread_data->td->palette_buffer);
#endif // CONFIG_PALETTE
#if CONFIG_MOTION_VAR
aom_free(thread_data->td->above_pred_buf);
aom_free(thread_data->td->left_pred_buf);
aom_free(thread_data->td->wsrc_buf);
aom_free(thread_data->td->mask_buf);
#endif // CONFIG_MOTION_VAR
aom_free(thread_data->td->counts);
av1_free_pc_tree(thread_data->td);
aom_free(thread_data->td);
......
......@@ -321,6 +321,12 @@ typedef struct ThreadData {
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
#if CONFIG_MOTION_VAR
int32_t *wsrc_buf;
int32_t *mask_buf;
uint8_t *above_pred_buf;
uint8_t *left_pred_buf;
#endif
#if CONFIG_PALETTE
PALETTE_BUFFER *palette_buffer;
......
......@@ -93,6 +93,29 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
thread_data->td->pc_tree = NULL;
av1_setup_pc_tree(cm, thread_data->td);
#if CONFIG_MOTION_VAR
#if CONFIG_HIGHBITDEPTH
int buf_scaler = 2;
#else
int buf_scaler = 1;
#endif
CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
(uint8_t *)aom_memalign(
16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
sizeof(*thread_data->td->above_pred_buf)));
CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
(uint8_t *)aom_memalign(
16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
sizeof(*thread_data->td->left_pred_buf)));
CHECK_MEM_ERROR(
cm, thread_data->td->wsrc_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
CHECK_MEM_ERROR(
cm, thread_data->td->mask_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
#endif
// Allocate frame counters in thread data.
CHECK_MEM_ERROR(cm, thread_data->td->counts,
aom_calloc(1, sizeof(*thread_data->td->counts)));
......@@ -132,6 +155,12 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
thread_data->td->rd_counts = cpi->td.rd_counts;
#if CONFIG_MOTION_VAR
thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
#endif
}
if (thread_data->td->counts != &cpi->common.counts) {
memcpy(thread_data->td->counts, &cpi->common.counts,
......
......@@ -9948,15 +9948,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
const MODE_INFO *left_mi = xd->left_mi;
#endif // CONFIG_PALETTE
#if CONFIG_MOTION_VAR
#if CONFIG_HIGHBITDEPTH
DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
#else
DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif // CONFIG_HIGHBITDEPTH
DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
......@@ -9965,22 +9956,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int len = sizeof(uint16_t);
args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
args.above_pred_buf[1] =
CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
args.above_pred_buf[2] =
CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
args.left_pred_buf[1] =
CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
args.left_pred_buf[2] =
CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
} else {
#endif // CONFIG_HIGHBITDEPTH
args.above_pred_buf[0] = tmp_buf1;
args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE;
args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
args.left_pred_buf[0] = tmp_buf2;
args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE;
args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
args.above_pred_buf[0] = x->above_pred_buf;
args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
args.left_pred_buf[0] = x->left_pred_buf;
args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
#if CONFIG_HIGHBITDEPTH
}
#endif // CONFIG_HIGHBITDEPTH
......@@ -10088,8 +10081,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
dst_height2, args.left_pred_stride);
av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
mi_col);
x->mask_buf = mask2d_buf;
x->wsrc_buf = weighted_src_buf;
calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
args.above_pred_stride[0], args.left_pred_buf[0],
args.left_pred_stride[0]);
......
......@@ -507,9 +507,6 @@ post_process_cmdline() {
soft_enable palette_throughput
soft_enable tempmv_signaling
# Workaround features currently incompatible with highbitdepth
enabled ext_partition && disable_feature highbitdepth
# Fix up experiment dependencies
enabled pvq && enable_feature ec_adapt
enabled pvq && disable_feature chroma_sub8x8
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment