Commit 20c1edf6 authored by hkuang's avatar hkuang

Refactor decode_tiles and loopfilter code.

The current decode_tiles decodes the frame one tile at a time
and then either loop-filters the whole frame or uses another worker
thread to do the loop filtering.

|------|------|------|------|
|Tile1-|Tile2-|Tile3-|Tile4-|
|------|------|------|------|

For example, if a tiled video has one tile row and four tile columns,
decode_tiles will decode Tile1, then Tile2, then Tile3, then Tile4.
While decoding each tile, decode_tile decodes that tile row by row.

For frame parallel decoding, decode_tiles will decode video in row order
across the tiles. So the order will be:
"Decode 1st row of Tile1" -> "Decode 1st row of Tile2"
-> "Decode 1st row of Tile3" -> "Decode 1st row of Tile4"
-> "Decode 2nd row of Tile1" -> "Decode 2nd row of Tile2"
-> "Decode 2nd row of Tile3" -> "Decode 2nd row of Tile4"-> "loopfilter 1st row"

Change-Id: I2211f9adc6d142fbf411d491031203cb8a6dbf6b
parent 7af34402
...@@ -21,13 +21,21 @@ static int get_tile_offset(int idx, int mis, int log2) { ...@@ -21,13 +21,21 @@ static int get_tile_offset(int idx, int mis, int log2) {
return MIN(offset, mis); return MIN(offset, mis);
} }
void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) {
tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
}
void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) {
tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
} }
// Fills in all four bounds of |tile| for the tile located at (row, col) in
// |cm|'s tile grid by delegating to the per-axis setters. The two setters
// write disjoint fields, so the order in which they run does not matter.
void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
  // Horizontal extent: mi_col_start / mi_col_end.
  vp9_tile_set_col(tile, cm, col);
  // Vertical extent: mi_row_start / mi_row_end.
  vp9_tile_set_row(tile, cm, row);
}
void vp9_get_tile_n_bits(int mi_cols, void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) { int *min_log2_tile_cols, int *max_log2_tile_cols) {
const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
......
...@@ -27,6 +27,9 @@ typedef struct TileInfo { ...@@ -27,6 +27,9 @@ typedef struct TileInfo {
void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
int row, int col); int row, int col);
void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row);
void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col);
void vp9_get_tile_n_bits(int mi_cols, void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols); int *min_log2_tile_cols, int *max_log2_tile_cols);
......
...@@ -675,64 +675,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, ...@@ -675,64 +675,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
setup_display_size(cm, rb); setup_display_size(cm, rb);
} }
// Decodes every 64x64 block inside |tile|, one block row at a time, pulling
// symbols from the reader |r| and using the decoder-wide MACROBLOCKD pbi->mb.
//
// When |do_loopfilter_inline| is nonzero, loop filtering is interleaved with
// decoding through pbi->lf_worker: after each block row is decoded, the
// previous block row is handed to the loop-filter worker, and whatever is
// left is filtered up to cm->mi_rows once the tile has been decoded.
// pbi->lf_worker.data1 is dereferenced as an LFWorkerData without a NULL
// check, so it must already be set up by the caller in that mode.
static void decode_tile(VP9Decoder *pbi, const TileInfo *const tile,
int do_loopfilter_inline, vp9_reader *r) {
const int num_threads = pbi->max_threads;
VP9_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
MACROBLOCKD *xd = &pbi->mb;
if (do_loopfilter_inline) {
// Point the loop-filter job at the frame being decoded and reset its row
// range; |stop| starts at 0 so the final pass below begins at row 0 if no
// intermediate range was ever dispatched.
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->stop = 0;
lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
// the SB (presumably; the original comment is cut off here). Both left
// context buffers are cleared before each block row of the tile.
vp9_zero(xd->left_context);
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
}
if (do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// delay the loopfilter by 1 macroblock row: nothing to filter yet after
// the very first row.
if (lf_start < 0) continue;
// decoding has completed: finish up the loop filter in this thread
// (handled by the block after this loop).
if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue;
// Wait for the previously dispatched range before overwriting the
// job's start/stop fields.
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_start;
lf_data->stop = mi_row;
// With more than one thread the job is launched on the worker so it can
// overlap with decoding of the next row; otherwise it is run through
// vp9_worker_execute().
if (num_threads > 1) {
vp9_worker_launch(&pbi->lf_worker);
} else {
vp9_worker_execute(&pbi->lf_worker);
}
}
}
if (do_loopfilter_inline) {
// Filter everything from the end of the last dispatched range through the
// last row of the frame.
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
lf_data->stop = cm->mi_rows;
vp9_worker_execute(&pbi->lf_worker);
}
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
int min_log2_tile_cols, max_log2_tile_cols, max_ones; int min_log2_tile_cols, max_log2_tile_cols, max_ones;
vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
...@@ -811,16 +753,35 @@ static void get_tile_buffers(VP9Decoder *pbi, ...@@ -811,16 +753,35 @@ static void get_tile_buffers(VP9Decoder *pbi,
static const uint8_t *decode_tiles(VP9Decoder *pbi, static const uint8_t *decode_tiles(VP9Decoder *pbi,
const uint8_t *data, const uint8_t *data,
const uint8_t *data_end, const uint8_t *data_end) {
int do_loopfilter_inline) {
VP9_COMMON *const cm = &pbi->common; VP9_COMMON *const cm = &pbi->common;
const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows; const int tile_rows = 1 << cm->log2_tile_rows;
TileBuffer tile_buffers[4][1 << 6]; TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col; int tile_row, tile_col;
const uint8_t *end = NULL; int mi_row, mi_col;
vp9_reader r; TileData *tile_data = NULL;
if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Loop filter thread creation failed");
}
}
if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->stop = 0;
lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
assert(tile_rows <= 4); assert(tile_rows <= 4);
assert(tile_cols <= (1 << 6)); assert(tile_cols <= (1 << 6));
...@@ -835,26 +796,88 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, ...@@ -835,26 +796,88 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
// Decode tiles using data from tile_buffers if (pbi->tile_data == NULL ||
(tile_cols * tile_rows) != pbi->total_tiles) {
vpx_free(pbi->tile_data);
CHECK_MEM_ERROR(
cm,
pbi->tile_data,
vpx_malloc(tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
pbi->total_tiles = tile_rows * tile_cols;
}
// Load all tile information into tile_data.
for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
const int col = pbi->inv_tile_order ? tile_cols - tile_col - 1 : tile_col;
const int last_tile = tile_row == tile_rows - 1 &&
col == tile_cols - 1;
const TileBuffer *const buf = &tile_buffers[tile_row][col];
TileInfo tile; TileInfo tile;
const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
tile_data->cm = cm;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
vp9_zero(tile_data->xd.dqcoeff);
}
}
vp9_tile_init(&tile, cm, tile_row, col); for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r, TileInfo tile;
pbi->decrypt_cb, pbi->decrypt_state); vp9_tile_set_row(&tile, cm, tile_row);
decode_tile(pbi, &tile, do_loopfilter_inline, &r); for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
mi_row += MI_BLOCK_SIZE) {
if (last_tile) for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
end = vp9_reader_find_end(&r); const int col = pbi->inv_tile_order ?
tile_cols - tile_col - 1 : tile_col;
tile_data = pbi->tile_data + tile_cols * tile_row + col;
vp9_tile_set_col(&tile, tile_data->cm, col);
vp9_zero(tile_data->xd.left_context);
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
// Loopfilter one row.
if (cm->lf.filter_level) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// delay the loopfilter by 1 macroblock row.
if (lf_start < 0) continue;
// decoding has completed: finish up the loop filter in this thread.
if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_start;
lf_data->stop = mi_row;
if (pbi->max_threads > 1) {
vp9_worker_launch(&pbi->lf_worker);
} else {
vp9_worker_execute(&pbi->lf_worker);
}
}
} }
} }
return end; // Loopfilter remaining rows in the frame.
if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
lf_data->stop = cm->mi_rows;
vp9_worker_execute(&pbi->lf_worker);
}
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
return vp9_reader_find_end(&tile_data->bit_reader);
} }
static int tile_worker_hook(void *arg1, void *arg2) { static int tile_worker_hook(void *arg1, void *arg2) {
...@@ -1279,7 +1302,6 @@ static struct vp9_read_bit_buffer* init_read_bit_buffer( ...@@ -1279,7 +1302,6 @@ static struct vp9_read_bit_buffer* init_read_bit_buffer(
const uint8_t *data, const uint8_t *data,
const uint8_t *data_end, const uint8_t *data_end,
uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) { uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) {
vp9_zero(*rb);
rb->bit_offset = 0; rb->bit_offset = 0;
rb->error_handler = error_handler; rb->error_handler = error_handler;
rb->error_handler_data = &pbi->common; rb->error_handler_data = &pbi->common;
...@@ -1300,7 +1322,7 @@ int vp9_decode_frame(VP9Decoder *pbi, ...@@ -1300,7 +1322,7 @@ int vp9_decode_frame(VP9Decoder *pbi,
const uint8_t **p_data_end) { const uint8_t **p_data_end) {
VP9_COMMON *const cm = &pbi->common; VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb; MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb; struct vp9_read_bit_buffer rb = { 0 };
uint8_t clear_data[MAX_VP9_HEADER_SIZE]; uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi, const size_t first_partition_size = read_uncompressed_header(pbi,
init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
...@@ -1308,8 +1330,6 @@ int vp9_decode_frame(VP9Decoder *pbi, ...@@ -1308,8 +1330,6 @@ int vp9_decode_frame(VP9Decoder *pbi,
const int tile_rows = 1 << cm->log2_tile_rows; const int tile_rows = 1 << cm->log2_tile_rows;
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
const int do_loopfilter_inline = tile_rows == 1 && tile_cols == 1 &&
cm->lf.filter_level;
xd->cur_buf = new_fb; xd->cur_buf = new_fb;
if (!first_partition_size) { if (!first_partition_size) {
...@@ -1352,19 +1372,7 @@ int vp9_decode_frame(VP9Decoder *pbi, ...@@ -1352,19 +1372,7 @@ int vp9_decode_frame(VP9Decoder *pbi,
// to do parallel loopfiltering. // to do parallel loopfiltering.
vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
} else { } else {
if (do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Loop filter thread creation failed");
}
}
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end,
do_loopfilter_inline);
if (!do_loopfilter_inline)
vp9_loop_filter_frame(new_fb, cm, &pbi->mb, cm->lf.filter_level, 0, 0);
} }
new_fb->corrupted |= xd->corrupted; new_fb->corrupted |= xd->corrupted;
......
...@@ -90,6 +90,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { ...@@ -90,6 +90,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
vp9_remove_common(cm); vp9_remove_common(cm);
vp9_worker_end(&pbi->lf_worker); vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1); vpx_free(pbi->lf_worker.data1);
vpx_free(pbi->tile_data);
for (i = 0; i < pbi->num_tile_workers; ++i) { for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i]; VP9Worker *const worker = &pbi->tile_workers[i];
vp9_worker_end(worker); vp9_worker_end(worker);
......
...@@ -27,6 +27,13 @@ ...@@ -27,6 +27,13 @@
extern "C" { extern "C" {
#endif #endif
// TODO(hkuang): combine this with TileWorkerData.
// Per-tile decoding state. decode_tiles() allocates one TileData per tile in
// VP9Decoder::tile_data and initializes every entry before decoding starts.
typedef struct TileData {
VP9_COMMON *cm;  // Common decoder state; set to &pbi->common.
vp9_reader bit_reader;  // Reader set up over this tile's buffer.
DECLARE_ALIGNED(16, MACROBLOCKD, xd);  // Per-tile copy of pbi->mb.
} TileData;
typedef struct VP9Decoder { typedef struct VP9Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb); DECLARE_ALIGNED(16, MACROBLOCKD, mb);
...@@ -40,10 +47,12 @@ typedef struct VP9Decoder { ...@@ -40,10 +47,12 @@ typedef struct VP9Decoder {
int decoded_key_frame; int decoded_key_frame;
VP9Worker lf_worker; VP9Worker lf_worker;
VP9Worker *tile_workers; VP9Worker *tile_workers;
int num_tile_workers; int num_tile_workers;
TileData *tile_data;
int total_tiles;
VP9LfSync lf_row_sync; VP9LfSync lf_row_sync;
vpx_decrypt_cb decrypt_cb; vpx_decrypt_cb decrypt_cb;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment