Commit 01900edc authored by James Zern, committed by Gerrit Code Review
Browse files

Merge changes I8a9c9019,Ic7b2faa3,I44d42a50,I3f3a3924,I10747b32,I31b49c9e

* changes:
  add vp9_loop_filter_data_reset
  move LFWorkerData allocation to VP9LfSync
  vp9_loop_filter_frame_mt: remove pbi dependency
  vp9_loop_filter_frame_mt: pass planes directly
  vp9_loop_filter_frame_mt: pass VP9LfSync directly
  vp9: store TileWorkerData allocations separately
parents fe2fd37b 01483677
......@@ -1625,6 +1625,17 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
y_only);
}
// Puts 'lf_data' into its initial state for filtering 'frame_buffer':
// records the frame buffer and common state, zeroes the row range
// ('start'/'stop') and the 'y_only' flag, and stores a private copy of
// 'planes'. Callers that need a non-empty row range or luma-only filtering
// set those fields after this call.
void vp9_loop_filter_data_reset(
    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
  // Scalar state first: no rows selected, all planes enabled.
  lf_data->y_only = 0;
  lf_data->start = lf_data->stop = 0;

  lf_data->cm = cm;
  lf_data->frame_buffer = frame_buffer;

  // Copy the plane descriptors by value; the byte count comes from the
  // destination member so it always matches the storage being written.
  vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
}
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
(void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
......
......@@ -124,11 +124,12 @@ typedef struct LoopFilterWorkerData {
int start;
int stop;
int y_only;
struct VP9LfSyncData *lf_sync;
int num_lf_workers;
} LFWorkerData;
// Initializes 'lf_data' with 'frame_buffer', 'cm' and a copy of 'planes',
// and zeroes its 'start', 'stop' and 'y_only' fields.
void vp9_loop_filter_data_reset(
LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
// Operates on the rows described by 'lf_data'.
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
#ifdef __cplusplus
......
......@@ -902,11 +902,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// Be sure to sync as we might be resuming after a failed frame decode.
winterface->sync(&pbi->lf_worker);
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->stop = 0;
lf_data->y_only = 0;
vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
pbi->mb.plane);
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
......@@ -1065,14 +1062,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// use num_threads - 1 workers.
CHECK_MEM_ERROR(cm, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
// Ensure tile data offsets will be properly aligned. This may fail on
// platforms without DECLARE_ALIGNED().
assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
vpx_memalign(32, num_threads *
sizeof(*pbi->tile_worker_data)));
CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
for (i = 0; i < num_threads; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
CHECK_MEM_ERROR(cm, worker->data1,
vpx_memalign(32, sizeof(TileWorkerData)));
CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
if (i < num_threads - 1 && !winterface->reset(worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
......@@ -1082,8 +1084,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
winterface->sync(&pbi->tile_workers[n]);
pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
VP9Worker *const worker = &pbi->tile_workers[n];
winterface->sync(worker);
worker->hook = (VP9WorkerHook)tile_worker_hook;
worker->data1 = &pbi->tile_worker_data[n];
worker->data2 = &pbi->tile_worker_info[n];
}
// Note: this memset assumes above_context[0], [1] and [2]
......@@ -1555,7 +1560,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
if (!xd->corrupted) {
// If multiple threads are used to decode tiles, then we use those threads
// to do parallel loopfiltering.
vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
pbi->tile_workers, pbi->num_tile_workers,
cm->lf.filter_level, 0);
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
......
......@@ -106,9 +106,9 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_get_worker_interface()->end(worker);
vpx_free(worker->data1);
vpx_free(worker->data2);
}
vpx_free(pbi->tile_worker_data);
vpx_free(pbi->tile_worker_info);
vpx_free(pbi->tile_workers);
if (pbi->num_tile_workers > 0) {
......
......@@ -46,6 +46,8 @@ typedef struct VP9Decoder {
VP9Worker lf_worker;
VP9Worker *tile_workers;
TileWorkerData *tile_worker_data;
TileInfo *tile_worker_info;
int num_tile_workers;
TileData *tile_data;
......
......@@ -92,12 +92,12 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
VP9_COMMON *const cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only,
VP9LfSync *const lf_sync, int num_lf_workers) {
VP9LfSync *const lf_sync) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int r, c; // SB row and col
const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
for (r = start; r < stop; r += num_lf_workers) {
for (r = start; r < stop; r += lf_sync->num_workers) {
const int mi_row = r << MI_BLOCK_SIZE_LOG2;
MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
......@@ -121,35 +121,35 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
}
// Row-based multi-threaded loopfilter hook
static int loop_filter_row_worker(TileWorkerData *const tile_data,
void *unused) {
LFWorkerData *const lf_data = &tile_data->lfdata;
(void)unused;
static int loop_filter_row_worker(VP9LfSync *const lf_sync,
LFWorkerData *const lf_data) {
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
return 1;
}
// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
// threads.
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
VP9Decoder *pbi, VP9_COMMON *cm,
void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
YV12_BUFFER_CONFIG *frame,
struct macroblockd_plane planes[MAX_MB_PLANE],
VP9_COMMON *cm,
VP9Worker *workers, int nworkers,
int frame_filter_level,
int y_only) {
VP9LfSync *const lf_sync = &pbi->lf_row_sync;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
const int tile_cols = 1 << cm->log2_tile_cols;
const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
const int num_workers = MIN(nworkers, tile_cols);
int i;
if (!frame_filter_level) return;
if (!lf_sync->sync_range || cm->last_height != cm->height) {
if (!lf_sync->sync_range || cm->last_height != cm->height ||
num_workers > lf_sync->num_workers) {
vp9_loop_filter_dealloc(lf_sync);
vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
}
vp9_loop_filter_frame_init(cm, frame_filter_level);
......@@ -158,32 +158,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
// Set up loopfilter thread data.
// The decoder is using num_workers instead of pbi->num_tile_workers
// because it has been observed that using more threads on the
// loopfilter, than there are tile columns in the frame will hurt
// performance on Android. This is because the system will only
// schedule the tile decode workers on cores equal to the number
// of tile columns. Then if the decoder tries to use more threads for the
// loopfilter, it will hurt performance because of contention. If the
// multithreading code changes in the future then the number of workers
// used by the loopfilter should be revisited.
// The decoder is capping num_workers because it has been observed that using
// more threads on the loopfilter than there are cores will hurt performance
// on Android. This is because the system will only schedule the tile decode
// workers on cores equal to the number of tile columns. Then if the decoder
// tries to use more threads for the loopfilter, it will hurt performance
// because of contention. If the multithreading code changes in the future
// then the number of workers used by the loopfilter should be revisited.
for (i = 0; i < num_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
LFWorkerData *const lf_data = &tile_data->lfdata;
VP9Worker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
worker->hook = (VP9WorkerHook)loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
// Loopfilter data
lf_data->frame_buffer = frame;
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
lf_data->start = i;
lf_data->stop = sb_rows;
lf_data->y_only = y_only; // always do all planes in decoder
lf_data->lf_sync = lf_sync;
lf_data->num_lf_workers = num_workers;
lf_data->y_only = y_only;
// Start loopfiltering
if (i == num_workers - 1) {
......@@ -195,7 +189,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
// Wait till all rows are finished
for (i = 0; i < num_workers; ++i) {
winterface->sync(&pbi->tile_workers[i]);
winterface->sync(&workers[i]);
}
}
......@@ -215,7 +209,7 @@ static int get_sync_range(int width) {
// Allocate memory for lf row synchronization
void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
int width) {
int width, int num_workers) {
lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
{
......@@ -239,6 +233,10 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
}
#endif // CONFIG_MULTITHREAD
CHECK_MEM_ERROR(cm, lf_sync->lfdata,
vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
lf_sync->num_workers = num_workers;
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
......@@ -265,6 +263,7 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
vpx_free(lf_sync->cond_);
}
#endif // CONFIG_MULTITHREAD
vpx_free(lf_sync->lfdata);
vpx_free(lf_sync->cur_sb_col);
// clear the structure as the source of this call may be a resize in which
// case this call will be followed by an _alloc() which may fail.
......
......@@ -22,9 +22,6 @@ typedef struct TileWorkerData {
struct VP9Common *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
// Row-based parallel loopfilter data
LFWorkerData lfdata;
} TileWorkerData;
// Loopfilter row synchronization
......@@ -39,19 +36,25 @@ typedef struct VP9LfSyncData {
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
int rows;
// Row-based parallel loopfilter data
LFWorkerData *lfdata;
int num_workers;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
int width);
int width, int num_workers);
// Deallocate loopfilter synchronization related mutex and data.
void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
// Multi-threaded loopfilter that uses the tile threads.
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
struct VP9Decoder *pbi,
void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
YV12_BUFFER_CONFIG *frame,
struct macroblockd_plane planes[MAX_MB_PLANE],
struct VP9Common *cm,
VP9Worker *workers, int num_workers,
int frame_filter_level,
int y_only);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment