diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index c2e472b5d8719804384df64d465e910613ef98cc..fa04528a2413c092d25b06fe24e11381913c623e 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -57,7 +57,7 @@ class AverageTestBase : public ::testing::Test {
   }
 
   // Sum Pixels
-  unsigned int ReferenceAverage(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
     unsigned int average = 0;
     for (int h = 0; h < 8; ++h)
       for (int w = 0; w < 8; ++w)
@@ -65,6 +65,14 @@ class AverageTestBase : public ::testing::Test {
     return ((average + 32) >> 6);
   }
 
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h)
+      for (int w = 0; w < 4; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 8) >> 4);
+  }
+
   void FillConstant(uint8_t fill_constant) {
     for (int i = 0; i < width_ * height_; ++i) {
       source_data_[i] = fill_constant;
@@ -85,7 +93,7 @@ class AverageTestBase : public ::testing::Test {
 };
 
 typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
-typedef std::tr1::tuple<int, int, int, AverageFunction> AvgFunc;
+typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
 
 class AverageTest
     : public AverageTestBase,
@@ -95,12 +103,18 @@ class AverageTest
  protected:
   void CheckAverages() {
-    unsigned int expected = ReferenceAverage(source_data_+ GET_PARAM(2),
-                                             source_stride_);
+    unsigned int expected = 0;
+    if (GET_PARAM(3) == 8) {
+      expected = ReferenceAverage8x8(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    } else if (GET_PARAM(3) == 4) {
+      expected = ReferenceAverage4x4(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    }
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(3)(source_data_+ GET_PARAM(2),
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(source_data_+ GET_PARAM(2),
                                           source_stride_));
-    unsigned int actual = GET_PARAM(3)(source_data_+ GET_PARAM(2),
+    unsigned int actual = GET_PARAM(4)(source_data_+ GET_PARAM(2),
                                        source_stride_);
 
     EXPECT_EQ(expected, actual);
@@ -134,16 +148,20 @@ using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
     C, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 1, &vp9_avg_8x8_c)));
+        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 5, &vp9_avg_8x8_sse2),
-        make_tuple(32, 32, 15, &vp9_avg_8x8_sse2)));
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
 
 #endif
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ae12808640213a5e70bade1d74f58633bb93c3b7..281dcbd8b0de653d3dac662de05c49fdd74e26f4 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1135,9 +1135,14 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2/;
 
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2/;
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vp9_highbd_avg_8x8/;
+  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_4x4/;
 }
 
 # ENCODEMB INVOKE
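
Note on the test parameterization: the tuple grows a fourth integer, the block size (8 or 4), so the same fixture drives both kernels; GET_PARAM(3) now selects the matching reference and the function under test moves to GET_PARAM(4). Both references use round-to-nearest division by the pixel count, which is what the (average + 8) >> 4 and (average + 32) >> 6 expressions implement. A minimal standalone check of that identity (hypothetical helper, not part of the patch):

    #include <assert.h>

    /* Round-to-nearest average of 1 << bits values: add half a divisor,
     * then shift. bits = 4 gives the 4x4 bias of 8; bits = 6 gives the
     * 8x8 bias of 32. */
    static unsigned int rounded_avg(unsigned int sum, int bits) {
      return (sum + (1u << (bits - 1))) >> bits;
    }

    int main(void) {
      assert(rounded_avg(1716, 4) == 107);  /* 1716 / 16 = 107.25 -> 107 */
      assert(rounded_avg(1608, 4) == 101);  /* 1608 / 16 = 100.5  -> 101 */
      return 0;
    }
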
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index e9810c894d3b26fa106d0b1a43c48b9d3c153279..f8fa7d2e8dfda96b97e18ab51ec29d25fe8e6900 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -19,6 +19,15 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
   return (sum + 32) >> 6;
 }
 
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
@@ -29,5 +38,16 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
 
   return (sum + 32) >> 6;
 }
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
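
The two C kernels are size-specialized instances of one pattern; a generic form (purely illustrative, not proposed for the tree) makes the relationship explicit and doubles as an independent cross-check for the tests above:

    #include <stdint.h>

    /* Hypothetical generic NxN average for power-of-two n: sum n rows of
     * stride p, divide by n * n with round-to-nearest. vp9_avg_8x8_c and
     * vp9_avg_4x4_c are the n = 8 and n = 4 instances. */
    static unsigned int avg_nxn(const uint8_t *s, int p, int n) {
      int i, j, sum = 0;
      for (i = 0; i < n; ++i, s += p)
        for (j = 0; j < n; ++j)
          sum += s[j];
      return (sum + n * n / 2) / (n * n);
    }
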
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d5122d0bcb4f1ab5beff566cd8c65e4b4a325cec..7788e502d3d8078f3c582af0569e0c92836bab24 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -291,6 +291,11 @@ typedef struct {
 typedef struct {
   partition_variance part_variances;
   var split[4];
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
 } v8x8;
 
 typedef struct {
@@ -348,6 +353,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
     case BLOCK_8X8: {
      v8x8 *vt = (v8x8 *) data;
       node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *vt = (v4x4 *) data;
+      node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i];
       break;
@@ -398,64 +410,76 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  // TODO(debargha): Choose this more intelligently.
-  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
+  // TODO(marpan): Adjust/tune these thresholds.
+  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
   int64_t threshold = (int64_t)(threshold_multiplier *
       vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+  int64_t threshold_bsize_ref = threshold << 6;
+  int64_t threshold_low = threshold;
+  BLOCK_SIZE bsize_ref = BLOCK_16X16;
+
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  // Split none is available only if we have more than half a block size
-  // in width and height inside the visible image.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->none.variance < threshold) {
-    set_block_size(cpi, xd, mi_row, mi_col, bsize);
-    return 1;
+  if (cm->frame_type == KEY_FRAME) {
+    bsize_ref = BLOCK_8X8;
+    // Choose lower thresholds for key frame variance to favor split.
+    threshold_bsize_ref = threshold >> 1;
+    threshold_low = threshold >> 2;
   }
 
-  // Only allow split for blocks above 16x16.
-  if (bsize > BLOCK_16X16) {
-    // Vertical split is available on all but the bottom border.
+  // For bsize=bsize_ref (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_ref) {
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_bsize_ref) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_ref) {
+    // For key frame, for bsize above 32X32, or very high variance, take split.
+    if (cm->frame_type == KEY_FRAME &&
+        (bsize > BLOCK_32X32 ||
+         vt.part_variances->none.variance > (threshold << 2))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_low) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    // Check vertical split.
     if (mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->vert[0].variance < threshold &&
-        vt.part_variances->vert[1].variance < threshold) {
+        vt.part_variances->vert[0].variance < threshold_low &&
+        vt.part_variances->vert[1].variance < threshold_low) {
       BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
       set_block_size(cpi, xd, mi_row, mi_col, subsize);
       set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
       return 1;
     }
-
-    // Horizontal split is available on all but the right border.
+    // Check horizontal split.
     if (mi_col + block_width / 2 < cm->mi_cols &&
-        vt.part_variances->horz[0].variance < threshold &&
-        vt.part_variances->horz[1].variance < threshold) {
+        vt.part_variances->horz[0].variance < threshold_low &&
+        vt.part_variances->horz[1].variance < threshold_low) {
       BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
       set_block_size(cpi, xd, mi_row, mi_col, subsize);
       set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
       return 1;
     }
-  }
-
-  // This will only allow 8x8 if the 16x16 variance is very large.
-  if (bsize == BLOCK_16X16) {
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < (threshold << 6)) {
-      set_block_size(cpi, xd, mi_row, mi_col, bsize);
-      return 1;
-    }
+    return 0;
   }
   return 0;
 }
 
-// This function chooses partitioning based on the variance
-// between source and reconstructed last, where variance is
-// computed for 8x8 downsampled inputs. Some things to check:
-// using the last source rather than reconstructed last, and
-// allowing for small downsampling (4x4 or 2x2) for selection
-// of smaller block sizes (i.e., < 16x16).
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for downsampled inputs.
+// Currently 8x8 downsampling is used for delta frames, 4x4 for key frames.
 static void choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 MACROBLOCK *x,
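
The rewritten set_vt_partitioning derives all of its cutoffs from one q-dependent base, with lower key-frame values to bias toward splitting. The arithmetic, pulled out for readability (a sketch only; vp9_convert_qindex_to_q is the existing qindex-to-quantizer mapping):

    #include <stdint.h>

    /* Threshold derivation as in the hunk above. t_bsize_ref gates the
     * no-split decision at the reference size (16x16 on delta frames,
     * 8x8 on key frames); t_low gates the none/vert/horz decisions at
     * the larger block sizes. */
    static void derive_thresholds(int is_key_frame, double q,
                                  int64_t *t_bsize_ref, int64_t *t_low) {
      const int mult = is_key_frame ? 80 : 4;
      const int64_t threshold = (int64_t)(mult * q);
      *t_bsize_ref = is_key_frame ? threshold >> 1 : threshold << 6;
      *t_low = is_key_frame ? threshold >> 2 : threshold;
    }
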
@@ -463,7 +487,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  int i, j, k;
+  int i, j, k, m;
   v64x64 vt;
   uint8_t *s;
   const uint8_t *d;
@@ -525,38 +549,63 @@ static void choose_partitioning(VP9_COMP *cpi,
       const int y16_idx = y32_idx + ((j >> 1) << 4);
       v16x16 *vst = &vt.split[i].split[j];
       for (k = 0; k < 4; k++) {
-        int x_idx = x16_idx + ((k & 1) << 3);
-        int y_idx = y16_idx + ((k >> 1) << 3);
-        unsigned int sse = 0;
-        int sum = 0;
-
-        if (x_idx < pixels_wide && y_idx < pixels_high) {
-          int s_avg, d_avg;
+        int x8_idx = x16_idx + ((k & 1) << 3);
+        int y8_idx = y16_idx + ((k >> 1) << 3);
+        if (cm->frame_type != KEY_FRAME) {
+          unsigned int sse = 0;
+          int sum = 0;
+          if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+            int s_avg, d_avg;
 #if CONFIG_VP9_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
-          } else {
-            s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
-          }
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+            } else {
+              s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+            }
 #else
-          s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-          d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+            s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+            d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
-          sum = s_avg - d_avg;
-          sse = sum * sum;
+            sum = s_avg - d_avg;
+            sse = sum * sum;
+          }
+          // If variance is based on 8x8 downsampling, we stop here and have
+          // one sample for 8x8 block (so use 1 for count in fill_variance),
+          // which of course means variance = 0 for 8x8 block.
+          fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+        } else {
+          // For key frame, go down to 4x4.
+          v8x8 *vst2 = &vst->split[k];
+          for (m = 0; m < 4; m++) {
+            int x4_idx = x8_idx + ((m & 1) << 2);
+            int y4_idx = y8_idx + ((m >> 1) << 2);
+            unsigned int sse = 0;
+            int sum = 0;
+            if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+              int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              // For key frame, reference is set to 128.
+              sum = s_avg - 128;
+              sse = sum * sum;
+            }
+            // If variance is based on 4x4 downsampling, we stop here and have
+            // one sample for 4x4 block (so use 1 for count in fill_variance),
+            // which of course means variance = 0 for 4x4 block.
+            fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+          }
         }
-        // For an 8x8 block we have just one value the average of all 64
-        // pixels, so use 1. This means of course that there is no variance
-        // in an 8x8 block.
-        fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
       }
     }
   }
   // Fill the rest of the variance tree by summing split partition values.
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
+      if (cm->frame_type == KEY_FRAME) {
+        for (m = 0; m < 4; m++) {
+          fill_variance_tree(&vt.split[i].split[j].split[m], BLOCK_8X8);
+        }
+      }
       fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
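
On the leaf comments above: a single (sse, sum) sample with count 1 always yields zero variance, because at the leaves sse == sum * sum by construction; nonzero variance only appears when sibling statistics are merged up the tree. Assuming fill_variance aggregates with the usual integer sample-variance identity, the leaf behavior falls out directly:

    #include <stdint.h>

    /* var = E[x^2] - E[x]^2 in integer form over `count` samples. With
     * count == 1 and sse == sum * sum the result is exactly 0, matching
     * the "variance = 0 for 8x8/4x4 block" remarks above. */
    static int64_t variance_of(int64_t sse, int64_t sum, int count) {
      return (sse - (sum * sum) / count) / count;
    }
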
@@ -564,8 +613,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   fill_variance_tree(&vt, BLOCK_64X64);
 
   // Now go through the entire structure, splitting every block size until
-  // we get to one that's got a variance lower than our threshold, or we
-  // hit 8x8.
+  // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col)) {
     for (i = 0; i < 4; ++i) {
@@ -576,11 +624,13 @@ static void choose_partitioning(VP9_COMP *cpi,
       for (j = 0; j < 4; ++j) {
         const int x16_idx = ((j & 1) << 1);
         const int y16_idx = ((j >> 1) << 1);
-        // NOTE: Since this uses 8x8 downsampling for variance calculation
-        // we cannot really select block size 8x8 (or even 8x16/16x8),
-        // since we do not sufficient samples for variance.
-        // For now, 8x8 partition is only set if the variance of the 16x16
-        // block is very high. This is controlled in set_vt_partitioning.
+        // Note: If 8x8 downsampling is used for variance calculation we
+        // cannot really select block size 8x8 (or even 8x16/16x8), since we
+        // don't have sufficient samples for variance. So on delta frames,
+        // 8x8 partition is only set if variance of the 16x16 block is very
+        // high. For key frames, 4x4 downsampling is used, so we can better
+        // select 8x16/16x8 and 8x8. 4x4 partition can potentially be used
+        // here too, but for now 4x4 is not allowed.
         if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j],
                                  BLOCK_16X16,
                                  mi_row + y32_idx + y16_idx,
@@ -588,10 +638,26 @@ static void choose_partitioning(VP9_COMP *cpi,
           for (k = 0; k < 4; ++k) {
             const int x8_idx = (k & 1);
             const int y8_idx = (k >> 1);
-            set_block_size(cpi, xd,
-                           (mi_row + y32_idx + y16_idx + y8_idx),
-                           (mi_col + x32_idx + x16_idx + x8_idx),
-                           BLOCK_8X8);
+            // TODO(marpan): Allow for setting 4x4 partition on key frame.
+            /*
+            if (cm->frame_type == KEY_FRAME) {
+              if (!set_vt_partitioning(cpi, xd,
+                                       &vt.split[i].split[j].split[k],
+                                       BLOCK_8X8,
+                                       mi_row + y32_idx + y16_idx + y8_idx,
+                                       mi_col + x32_idx + x16_idx + x8_idx)) {
+                set_block_size(cpi, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_4X4);
+              }
+            } else {
+            */
+            set_block_size(cpi, xd,
+                           (mi_row + y32_idx + y16_idx + y8_idx),
+                           (mi_col + x32_idx + x16_idx + x8_idx),
+                           BLOCK_8X8);
+            // }
           }
         }
       }
@@ -2511,7 +2577,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME ) {
+               cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
@@ -3532,6 +3598,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
       cm->uv_ac_delta_q == 0;
 
   cm->tx_mode = select_tx_mode(cpi, xd);
+  if (cm->frame_type == KEY_FRAME &&
+      cpi->sf.use_nonrd_pick_mode &&
+      cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+    cm->tx_mode = ALLOW_16X16;
+  }
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth)
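
One more note on the key-frame leaves: there is no coded reference to difference against when this runs, so the patch scores each 4x4 source average against flat mid-gray (128). A leaf's own variance is still zero, as above; what drives the key-frame split decisions is how much neighboring 4x4 averages disagree once they are merged up the tree. A sketch of the per-leaf statistic (assuming 8-bit input and the C kernel):

    #include <stdint.h>

    extern unsigned int vp9_avg_4x4_c(const uint8_t *s, int p);

    /* Key-frame leaf statistic as in choose_partitioning: deviation of the
     * 4x4 source average from 128, and its square, fed to fill_variance
     * with count 1. */
    static void key_frame_leaf_stats(const uint8_t *s, int sp,
                                     unsigned int *sse, int *sum) {
      const int s_avg = (int)vp9_avg_4x4_c(s, sp);
      *sum = s_avg - 128;
      *sse = (unsigned int)(*sum * *sum);
    }
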
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 5c70b4ee464792b15cd76cd3b037fdb8e991c134..85d0dba599c147176a56c5cdce42c6f70701350c 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -321,7 +321,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->partition_search_type = VAR_BASED_PARTITION;
 
     // Turn on this to use non-RD key frame coding mode.
-    // sf->use_nonrd_pick_mode = 1;
+    sf->use_nonrd_pick_mode = 1;
     sf->mv.search_method = NSTEP;
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
     sf->mv.reduce_first_step_size = 1;
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index ca6cf1ac91667168d2f2265e3cea3a64bc9846ef..4c3495b056033380e5c99f27914b6f4e36362abb 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -38,3 +38,21 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
   avg = _mm_extract_epi16(s0, 0);
   return (avg + 32) >> 6;
 }
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
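
On the SSE2 kernel: each _mm_loadl_epi64 pulls eight bytes, so four bytes past the 4x4 block are read (and ignored) per row; after widening to 16-bit lanes and three saturating adds, lanes 0-3 hold the four column sums (saturation cannot trigger, since a column sum is at most 4 * 255). The two shifts then fold those four lanes into lane 0: _mm_srli_si128(s0, 4) moves lanes 2,3 onto 0,1 and _mm_srli_epi64(s0, 16) moves lane 1 onto lane 0. A scalar model of that reduction (illustrative only):

    #include <stdint.h>

    /* Scalar equivalent of the two-step lane fold in vp9_avg_4x4_sse2;
     * lanes[0..3] are the column sums left by the row additions. */
    static unsigned int reduce4(uint16_t lanes[4]) {
      lanes[0] = (uint16_t)(lanes[0] + lanes[2]);  /* _mm_srli_si128(s0, 4) + add */
      lanes[1] = (uint16_t)(lanes[1] + lanes[3]);
      lanes[0] = (uint16_t)(lanes[0] + lanes[1]);  /* _mm_srli_epi64(s0, 16) + add */
      return lanes[0];  /* _mm_extract_epi16(s0, 0) */
    }
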