Commit 903801f1 authored by Yunqing Wang's avatar Yunqing Wang

vp9 decoder: row-based multi-threaded loopfilter

Implemented parallel loopfiltering, which uses existing tile-
decoding threads. Each thread works on one row, and when that row
is loopfiltered, it moves to next unattended row. To ensure the
correct filtering order, threads are synchronized and one
superblock is filtered only if the superblocks it depends on are
filtered already.

To reduce synchronization overhead and speed up the decoder, we use
nsync > 1 for high resolution.

Performance tests:
1. on desktop:
8-tile 4k video using 8 threads, speedup: 70% - 80%
4-tile HD video using 4 threads, speedup: ~35%
2. on mobile device(Nexus 7):
4-tile 1080p video using 4 threads, speedup: 18% - 25%
4-tile 1080p video using 2 threads, speedup: 10% - 15%

Change-Id: If54b4a11960dd706c22d5ad145ad94156031f36a
parent e78c174e
...@@ -16,26 +16,6 @@ ...@@ -16,26 +16,6 @@
#include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_seg_common.h"
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
// left of the block. Above_ entries refer to whether or not to apply a
// filter on the above border. Int_ entries refer to whether or not to
// apply borders on the 4x4 edges within the 8x8 block that each bit
// represents.
// Since each transform is accompanied by a potentially different type of
// loop filter there is a different entry in the array for each transform size.
typedef struct {
uint64_t left_y[TX_SIZES];
uint64_t above_y[TX_SIZES];
uint64_t int_4x4_y;
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
uint8_t lfl_y[64];
uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;
// 64 bit masks for left transform size. Each 1 represents a position where // 64 bit masks for left transform size. Each 1 represents a position where
// we should apply a loop filter across the left border of an 8x8 block // we should apply a loop filter across the left border of an 8x8 block
// boundary. // boundary.
...@@ -638,9 +618,9 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, ...@@ -638,9 +618,9 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
// This function sets up the bit masks for the entire 64x64 region represented // This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col. // by mi_row, mi_col.
// TODO(JBB): This function only works for yv12. // TODO(JBB): This function only works for yv12.
static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
MODE_INFO **mi_8x8, const int mode_info_stride, MODE_INFO **mi_8x8, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) { LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8; int idx_32, idx_16, idx_8;
const loop_filter_info_n *const lfi_n = &cm->lf_info; const loop_filter_info_n *const lfi_n = &cm->lf_info;
MODE_INFO **mip = mi_8x8; MODE_INFO **mip = mi_8x8;
...@@ -1069,10 +1049,10 @@ static void filter_block_plane_non420(VP9_COMMON *cm, ...@@ -1069,10 +1049,10 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
} }
#endif #endif
static void filter_block_plane(VP9_COMMON *const cm, void vp9_filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane, struct macroblockd_plane *const plane,
int mi_row, int mi_row,
LOOP_FILTER_MASK *lfm) { LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst; struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf; uint8_t* const dst0 = dst->buf;
int r, c; int r, c;
...@@ -1244,14 +1224,14 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, ...@@ -1244,14 +1224,14 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
#if CONFIG_NON420 #if CONFIG_NON420
if (use_420) if (use_420)
#endif #endif
setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col,
&lfm); cm->mode_info_stride, &lfm);
for (plane = 0; plane < num_planes; ++plane) { for (plane = 0; plane < num_planes; ++plane) {
#if CONFIG_NON420 #if CONFIG_NON420
if (use_420) if (use_420)
#endif #endif
filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
#if CONFIG_NON420 #if CONFIG_NON420
else else
filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
......
...@@ -60,9 +60,42 @@ typedef struct { ...@@ -60,9 +60,42 @@ typedef struct {
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
} loop_filter_info_n; } loop_filter_info_n;
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
// left of the block. Above_ entries refer to whether or not to apply a
// filter on the above border. Int_ entries refer to whether or not to
// apply borders on the 4x4 edges within the 8x8 block that each bit
// represents.
// Since each transform is accompanied by a potentially different type of
// loop filter there is a different entry in the array for each transform size.
typedef struct {
uint64_t left_y[TX_SIZES];
uint64_t above_y[TX_SIZES];
uint64_t int_4x4_y;
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
uint8_t lfl_y[64];
uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;
/* assorted loopfilter functions which get used elsewhere */ /* assorted loopfilter functions which get used elsewhere */
struct VP9Common; struct VP9Common;
struct macroblockd; struct macroblockd;
struct VP9LfSyncData;
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
void vp9_setup_mask(struct VP9Common *const cm,
const int mi_row, const int mi_col,
MODE_INFO **mi_8x8, const int mode_info_stride,
LOOP_FILTER_MASK *lfm);
void vp9_filter_block_plane(struct VP9Common *const cm,
struct macroblockd_plane *const plane,
int mi_row,
LOOP_FILTER_MASK *lfm);
void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_init(struct VP9Common *cm);
...@@ -90,6 +123,9 @@ typedef struct LoopFilterWorkerData { ...@@ -90,6 +123,9 @@ typedef struct LoopFilterWorkerData {
int start; int start;
int stop; int stop;
int y_only; int y_only;
struct VP9LfSyncData *lf_sync;
int num_lf_workers;
} LFWorkerData; } LFWorkerData;
// Operates on the rows described by LFWorkerData passed as 'arg1'. // Operates on the rows described by LFWorkerData passed as 'arg1'.
......
...@@ -33,18 +33,12 @@ ...@@ -33,18 +33,12 @@
#include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_dsubexp.h" #include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_reader.h" #include "vp9/decoder/vp9_reader.h"
#include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_thread.h"
typedef struct TileWorkerData {
VP9_COMMON *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
} TileWorkerData;
static int read_be32(const uint8_t *p) { static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
} }
...@@ -982,7 +976,6 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { ...@@ -982,7 +976,6 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
++pbi->num_tile_workers; ++pbi->num_tile_workers;
vp9_worker_init(worker); vp9_worker_init(worker);
worker->hook = (VP9WorkerHook)tile_worker_hook;
CHECK_MEM_ERROR(cm, worker->data1, CHECK_MEM_ERROR(cm, worker->data1,
vpx_memalign(32, sizeof(TileWorkerData))); vpx_memalign(32, sizeof(TileWorkerData)));
CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
...@@ -993,6 +986,11 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { ...@@ -993,6 +986,11 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
} }
} }
// Reset tile decoding hook
for (n = 0; n < pbi->num_tile_workers; ++n) {
pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
}
// Note: this memset assumes above_context[0], [1] and [2] // Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer. // are allocated as part of the same buffer.
vpx_memset(pbi->above_context[0], 0, vpx_memset(pbi->above_context[0], 0,
...@@ -1393,9 +1391,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { ...@@ -1393,9 +1391,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
*p_data_end = decode_tiles(pbi, data + first_partition_size); *p_data_end = decode_tiles(pbi, data + first_partition_size);
} }
cm->last_width = cm->width;
cm->last_height = cm->height;
new_fb->corrupted |= xd->corrupted; new_fb->corrupted |= xd->corrupted;
if (!pbi->decoded_key_frame) { if (!pbi->decoded_key_frame) {
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#if CONFIG_MULTITHREAD
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
const int kMaxTryLocks = 4000;
int locked = 0;
int i;
for (i = 0; i < kMaxTryLocks; ++i) {
if (!pthread_mutex_trylock(mutex)) {
locked = 1;
break;
}
}
if (!locked)
pthread_mutex_lock(mutex);
}
#endif // CONFIG_MULTITHREAD
static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
#if CONFIG_MULTITHREAD
const int nsync = lf_sync->sync_range;
if (r && !(c & (nsync - 1))) {
mutex_lock(&lf_sync->mutex_[r - 1]);
while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
pthread_cond_wait(&lf_sync->cond_[r - 1],
&lf_sync->mutex_[r - 1]);
}
pthread_mutex_unlock(&lf_sync->mutex_[r - 1]);
}
#else
(void)lf_sync;
(void)r;
(void)c;
#endif // CONFIG_MULTITHREAD
}
static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
const int sb_cols) {
#if CONFIG_MULTITHREAD
const int nsync = lf_sync->sync_range;
int cur;
// Only signal when there are enough filtered SB for next row to run.
int sig = 1;
if (c < sb_cols - 1) {
cur = c;
if (c % nsync)
sig = 0;
} else {
cur = sb_cols + nsync;
}
if (sig) {
mutex_lock(&lf_sync->mutex_[r]);
lf_sync->cur_sb_col[r] = cur;
pthread_cond_signal(&lf_sync->cond_[r]);
pthread_mutex_unlock(&lf_sync->mutex_[r]);
}
#else
(void)lf_sync;
(void)r;
(void)c;
(void)sb_cols;
#endif // CONFIG_MULTITHREAD
}
// Implement row loopfiltering for each thread.
static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
VP9_COMMON *const cm, MACROBLOCKD *const xd,
int start, int stop, int y_only,
VP9LfSync *const lf_sync, int num_lf_workers) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int r, c; // SB row and col
LOOP_FILTER_MASK lfm;
const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
for (r = start; r < stop; r += num_lf_workers) {
const int mi_row = r << MI_BLOCK_SIZE_LOG2;
MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
for (c = 0; c < sb_cols; ++c) {
const int mi_col = c << MI_BLOCK_SIZE_LOG2;
int plane;
sync_read(lf_sync, r, c);
setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
&lfm);
for (plane = 0; plane < num_planes; ++plane) {
vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
}
sync_write(lf_sync, r, c, sb_cols);
}
}
}
// Row-based multi-threaded loopfilter hook
static int loop_filter_row_worker(void *arg1, void *arg2) {
TileWorkerData *const tile_data = (TileWorkerData*)arg1;
LFWorkerData *const lf_data = &tile_data->lfdata;
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
return 1;
}
// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
// threads.
void vp9_loop_filter_frame_mt(VP9D_COMP *pbi,
VP9_COMMON *cm,
MACROBLOCKD *xd,
int frame_filter_level,
int y_only, int partial) {
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
int i;
// Allocate memory used in thread synchronization.
// This always needs to be done even if frame_filter_level is 0.
if (!cm->current_video_frame || cm->last_height != cm->height) {
VP9LfSync *const lf_sync = &pbi->lf_row_sync;
if (cm->last_height != cm->height) {
const int aligned_last_height =
ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
const int last_sb_rows =
mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
MI_BLOCK_SIZE_LOG2;
vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
}
vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
}
if (!frame_filter_level) return;
vp9_loop_filter_frame_init(cm, frame_filter_level);
// Initialize cur_sb_col to -1 for all SB rows.
vpx_memset(pbi->lf_row_sync.cur_sb_col, -1,
sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows);
// Set up loopfilter thread data.
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
LFWorkerData *const lf_data = &tile_data->lfdata;
worker->hook = (VP9WorkerHook)loop_filter_row_worker;
// Loopfilter data
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
lf_data->xd = pbi->mb;
lf_data->start = i;
lf_data->stop = sb_rows;
lf_data->y_only = y_only; // always do all planes in decoder
lf_data->lf_sync = &pbi->lf_row_sync;
lf_data->num_lf_workers = pbi->num_tile_workers;
// Start loopfiltering
if (i == pbi->num_tile_workers - 1) {
vp9_worker_execute(worker);
} else {
vp9_worker_launch(worker);
}
}
// Wait till all rows are finished
for (i = 0; i < pbi->num_tile_workers; ++i) {
vp9_worker_sync(&pbi->tile_workers[i]);
}
}
// Set up nsync by width.
static int get_sync_range(int width) {
// nsync numbers are picked by testing. For example, for 4k
// video, using 4 gives best performance.
if (width < 640)
return 1;
else if (width <= 1280)
return 2;
else if (width <= 4096)
return 4;
else
return 8;
}
// Allocate memory for lf row synchronization
void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
int width) {
#if CONFIG_MULTITHREAD
int i;
CHECK_MEM_ERROR(cm, lf_sync->mutex_,
vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
CHECK_MEM_ERROR(cm, lf_sync->cond_,
vpx_malloc(sizeof(*lf_sync->cond_) * rows));
for (i = 0; i < rows; ++i) {
pthread_mutex_init(&lf_sync->mutex_[i], NULL);
pthread_cond_init(&lf_sync->cond_[i], NULL);
}
#endif // CONFIG_MULTITHREAD
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
// Set up nsync.
lf_sync->sync_range = get_sync_range(width);
}
// Deallocate lf synchronization related mutex and data
void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
#if CONFIG_MULTITHREAD
if (lf_sync != NULL) {
int i;
for (i = 0; i < rows; ++i) {
pthread_mutex_destroy(&lf_sync->mutex_[i]);
pthread_cond_destroy(&lf_sync->cond_[i]);
}
vpx_free(lf_sync->mutex_);
vpx_free(lf_sync->cond_);
vpx_free(lf_sync->cur_sb_col);
}
#else
(void)rows;
if (lf_sync != NULL)
vpx_free(lf_sync->cur_sb_col);
#endif // CONFIG_MULTITHREAD
}
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_DTHREAD_H_
#define VP9_DECODER_VP9_DTHREAD_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/decoder/vp9_reader.h"
#include "vp9/decoder/vp9_thread.h"
struct macroblockd;
struct VP9Common;
struct VP9Decompressor;
typedef struct TileWorkerData {
struct VP9Common *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
// Row-based parallel loopfilter data
LFWorkerData lfdata;
} TileWorkerData;
// Loopfilter row synchronization
typedef struct VP9LfSyncData {
#if CONFIG_MULTITHREAD
pthread_mutex_t *mutex_;
pthread_cond_t *cond_;
#endif
// Allocate memory to store the loop-filtered superblock index in each row.
int *cur_sb_col;
// The optimal sync_range for different resolution and platform should be
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(struct VP9Common *cm, struct VP9LfSyncData *lf_sync,
int rows, int width);
// Deallocate loopfilter synchronization related mutex and data.
void vp9_loop_filter_dealloc(struct VP9LfSyncData *lf_sync, int rows);
// Multi-threaded loopfilter that uses the tile threads.
void vp9_loop_filter_frame_mt(struct VP9Decompressor *pbi,
struct VP9Common *cm,
struct macroblockd *xd,
int frame_filter_level,
int y_only, int partial);
#endif // VP9_DECODER_VP9_DTHREAD_H_
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "vpx_ports/vpx_timer.h" #include "vpx_ports/vpx_timer.h"
#include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_dthread.h"
#include "./vpx_scale_rtcd.h" #include "./vpx_scale_rtcd.h"
#define WRITE_RECON_BUFFER 0 #define WRITE_RECON_BUFFER 0
...@@ -177,6 +178,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { ...@@ -177,6 +178,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
vpx_free(worker->data2); vpx_free(worker->data2);
} }
vpx_free(pbi->tile_workers); vpx_free(pbi->tile_workers);
if (pbi->num_tile_workers) {
VP9_COMMON *const cm = &pbi->common;
const int sb_rows =
mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
VP9LfSync *const lf_sync = &pbi->lf_row_sync;
vp9_loop_filter_dealloc(lf_sync, sb_rows);
}
vpx_free(pbi->mi_streams); vpx_free(pbi->mi_streams);
vpx_free(pbi->above_context[0]); vpx_free(pbi->above_context[0]);
vpx_free(pbi->above_seg_context); vpx_free(pbi->above_seg_context);
...@@ -370,7 +381,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, ...@@ -370,7 +381,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
#endif #endif
if (!pbi->do_loopfilter_inline) { if (!pbi->do_loopfilter_inline) {
vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); // If multiple threads are used to decode tiles, then we use those threads
// to do parallel loopfiltering.
if (pbi->num_tile_workers) {
vp9_loop_filter_frame_mt(pbi, cm, &pbi->mb, cm->lf.filter_level, 0, 0);
} else {
vp9_loop_filter_frame(cm, &pbi->mb, cm->lf.filter_level, 0, 0);
}
} }
#if WRITE_RECON_BUFFER == 2 #if WRITE_RECON_BUFFER == 2
...@@ -390,6 +407,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, ...@@ -390,6 +407,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
vp9_clear_system_state(); vp9_clear_system_state();
cm->last_width = cm->width;
cm->last_height = cm->height;
if (!cm->show_existing_frame) if (!cm->show_existing_frame)
cm->last_show_frame = cm->show_frame; cm->last_show_frame = cm->show_frame;
if (cm->show_frame) { if (cm->show_frame) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_onyxc_int.h"
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_onyxd.h" #include "vp9/decoder/vp9_onyxd.h"
#include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_thread.h"
...@@ -49,6 +50,8 @@ typedef struct VP9Decompressor { ...@@ -49,6 +50,8 @@ typedef struct VP9Decompressor {
VP9Worker *tile_workers; VP9Worker *tile_workers;
int num_tile_workers; int num_tile_workers;
VP9LfSync lf_row_sync;
/* Each tile column has its own MODE_INFO stream. This array indexes them by /* Each tile column has its own MODE_INFO stream. This array indexes them by
tile column index. */ tile column index. */