From 5f5552d846d3506c29d2db8454e5c70557400052 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Mon, 14 Mar 2016 18:59:11 -0700
Subject: [PATCH] Optimize HBD up-sampled prediction functions

Optimized 2 up-sampled reference prediction functions in the
high-bit-depth case. This reduced the HBD encoding time by 3%.

Change-Id: I8663ffb5234f5e70168c0fc9ca676309fe8e98f2
---
 vp10/encoder/mcomp.c               |  47 +---------
 vpx_dsp/variance.c                 |  38 ++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl       |   7 ++
 vpx_dsp/x86/highbd_variance_sse2.c | 137 +++++++++++++++++++++++++++++
 vpx_dsp/x86/variance_sse2.c        |  29 ++----
 5 files changed, 193 insertions(+), 65 deletions(-)

diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 1f147d7edd..23184ed925 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -685,47 +685,6 @@ static const MV search_step_table[12] = {
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(yunqing): Optimize the following 2 functions.
-static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred,
-                                           const uint8_t *pred8,
-                                           int width, int height,
-                                           const uint8_t *ref8,
-                                           int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[(j << 3)];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += stride;
-  }
-}
-
-static void highbd_upsampled_pred(uint16_t *comp_pred,
-                                  int width, int height,
-                                  const uint8_t *ref8,
-                                  int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ref[(j << 3)];
-    }
-    comp_pred += width;
-    ref += stride;
-  }
-}
-#endif
-
 static int upsampled_pref_error(const MACROBLOCKD *xd,
                                 const vp10_variance_fn_ptr_t *vfp,
                                 const uint8_t *const src, const int src_stride,
@@ -737,10 +696,10 @@ static int upsampled_pref_error(const MACROBLOCKD *xd,
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
     if (second_pred != NULL)
-      highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
-                                     y_stride);
+      vpx_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+                                         y_stride);
     else
-      highbd_upsampled_pred(pred16, w, h, y, y_stride);
+      vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
 
     besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
 
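Note on the removed helpers: they read from the up-sampled reference buffer, which holds the reference interpolated to 1/8-pel resolution in both directions, so advancing one output row moves the reference pointer by ref_stride << 3 and output column j reads ref[j << 3]. The comp-avg variant then averages that sample with the second predictor. A minimal per-sample sketch of that arithmetic, assuming the usual libvpx ROUND_POWER_OF_TWO definition (shown for reference only, not part of this patch):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* comp-avg of one high-bit-depth sample: (pred + ref + 1) >> 1 */
static uint16_t highbd_avg_sample(uint16_t pred, uint16_t ref) {
  const int tmp = pred + ref;
  return (uint16_t)ROUND_POWER_OF_TWO(tmp, 1);
}

The same scalar logic is re-added below in vpx_dsp/variance.c as the C reference versions, so that per-ISA specializations can be registered for them.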
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index ee1e3054a3..24f42df34a 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -651,6 +651,44 @@ void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
     ref += ref_stride;
   }
 }
+
+void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred,
+                                 int width, int height,
+                                 const uint8_t *ref8,
+                                 int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = ref[(j << 3)];
+    }
+    comp_pred += width;
+    ref += stride;
+  }
+}
+
+void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
+                                          const uint8_t *pred8,
+                                          int width, int height,
+                                          const uint8_t *ref8,
+                                          int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int tmp = pred[j] + ref[(j << 3)];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += stride;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP10 && CONFIG_EXT_INTER
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e5c002a70f..ced7009870 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1238,6 +1238,13 @@ specialize qw/vpx_upsampled_pred sse2/;
 add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 specialize qw/vpx_comp_avg_upsampled_pred sse2/;
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_upsampled_pred sse2/;
+  add_proto qw/void vpx_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_comp_avg_upsampled_pred sse2/;
+}
+
 #
 # ...
 #
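The add_proto/specialize entries above hook the two new functions into libvpx's run-time CPU dispatch (RTCD). What the generated vpx_dsp_rtcd.h provides for them looks roughly like the sketch below; this is illustrative only and not copied from the generated header, whose exact form depends on the build configuration:

#include <stdint.h>

/* Illustrative sketch: rtcd emits prototypes for each specialization and
 * either a #define (static dispatch) or a function pointer that
 * setup_rtcd_internal() assigns at run time (with runtime CPU detection). */
void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
                                 const uint8_t *ref8, int ref_stride);
void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
                                    const uint8_t *ref8, int ref_stride);
/* Static-dispatch form, assuming SSE2 is enabled at build time: */
#define vpx_highbd_upsampled_pred vpx_highbd_upsampled_pred_sse2

The mcomp.c call sites in the first hunk go through these wrappers, so they automatically pick up the SSE2 versions added next.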
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 81ec5dbdb9..e2b79bff1d 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,7 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_ports/mem.h"
 
@@ -591,3 +595,136 @@ FNS(sse2);
 #undef FNS
 #undef FN
 #endif  // CONFIG_USE_X86INC
+
+void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred,
+                                    int width, int height,
+                                    const uint8_t *ref8,
+                                    int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), t0);
+        comp_pred += 8;
+        ref += 64;  // 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), t0);
+        comp_pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
+
+void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
+                                             const uint8_t *pred8,
+                                             int width, int height,
+                                             const uint8_t *ref8,
+                                             int ref_stride) {
+  const __m128i one = _mm_set1_epi16(1);
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), p0);
+        comp_pred += 8;
+        pred += 8;
+        ref += 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), p0);
+        comp_pred += 4;
+        pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 63fc1e6741..dc511737cf 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -509,12 +509,11 @@ void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
       s2 = _mm_unpacklo_epi8(t1, s3);
       s4 = _mm_unpacklo_epi8(t2, s5);
       s6 = _mm_unpacklo_epi8(t3, s7);
+      s0 = _mm_unpacklo_epi32(s0, s2);
+      s4 = _mm_unpacklo_epi32(s4, s6);
+      s0 = _mm_unpacklo_epi64(s0, s4);
 
-      *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-      *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
-      *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(s4);
-      *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(s6);
-
+      _mm_storeu_si128((__m128i *)(comp_pred), s0);
       comp_pred += 16;
       ref += 16 * 8;
     }
@@ -537,9 +536,9 @@ void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
       s0 = _mm_unpacklo_epi8(t0, s1);
       s2 = _mm_unpacklo_epi8(t1, s3);
+      s0 = _mm_unpacklo_epi32(s0, s2);
 
-      *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-      *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
+      _mm_storel_epi64((__m128i *)(comp_pred), s0);
 
       comp_pred += 8;
       ref += 8 * 8;
     }
@@ -558,7 +557,6 @@ void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
       s0 = _mm_unpacklo_epi8(t0, s1);
 
       *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-
       comp_pred += 4;
       ref += 4 * 8;
     }
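In the two SSE2 kernels above, the eight (or four) _mm_cvtsi32_si128() loads plus the unpack tree gather every eighth uint16_t of the up-sampled row; the neighboring sample pulled in by each 32-bit load ends up in the half that the unpacks discard, so one 8-wide iteration is equivalent to comp_pred[k] = ref[k * 8] for k = 0..7. The comp-avg kernel then averages the gathered samples with the second predictor using two saturating adds and a shift; high-bit-depth samples are at most 12 bits, so pred + ref + 1 never exceeds 8191, the saturation never triggers, and each lane equals ROUND_POWER_OF_TWO(pred + ref, 1). A small sketch of that averaging step (an illustrative helper, not part of the patch):

#include <emmintrin.h>

/* Rounded average of two vectors of up-to-12-bit samples, mirroring the
 * _mm_adds_epu16/_mm_srli_epi16 sequence used in
 * vpx_highbd_comp_avg_upsampled_pred_sse2(). */
static __m128i highbd_avg_round_epu16(__m128i a, __m128i b) {
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = _mm_adds_epu16(a, b);  /* a + b (no clamp for 12-bit input) */
  sum = _mm_adds_epu16(sum, one);      /* + 1 for round-to-nearest */
  return _mm_srli_epi16(sum, 1);       /* >> 1 */
}

(_mm_avg_epu16 computes the same (a + b + 1) >> 1 in a single instruction and would be an equivalent way to write this step.)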
@@ -621,14 +619,7 @@ void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
       p1 = _mm_srli_epi16(p1, 1);
       p0 = _mm_packus_epi16(p0, p1);
 
-      *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-      p0 = _mm_srli_si128(p0, 4);
-      *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
-      p0 = _mm_srli_si128(p0, 4);
-      *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(p0);
-      p0 = _mm_srli_si128(p0, 4);
-      *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(p0);
-
+      _mm_storeu_si128((__m128i *)(comp_pred), p0);
       comp_pred += 16;
       pred += 16;
       ref += 16 * 8;
@@ -662,10 +653,7 @@ void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
       p0 = _mm_srli_epi16(p0, 1);
       p0 = _mm_packus_epi16(p0, zero);
 
-      *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-      p0 = _mm_srli_si128(p0, 4);
-      *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
-
+      _mm_storel_epi64((__m128i *)(comp_pred), p0);
       comp_pred += 8;
       pred += 8;
       ref += 8 * 8;
@@ -693,7 +681,6 @@ void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
       p0 = _mm_packus_epi16(p0, zero);
 
       *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-
      comp_pred += 4;
       pred += 4;
       ref += 4 * 8;
--
GitLab
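The variance_sse2.c hunks are a pure store consolidation: in the 16- and 8-pixel paths, the per-lane extract/shift/store sequences are replaced by a single unaligned vector store (_mm_storeu_si128 / _mm_storel_epi64), and the now-redundant blank lines go with them. A before/after sketch of the 16-byte case, with illustrative helper names not taken from the patch:

#include <emmintrin.h>
#include <stdint.h>

/* Before: four 4-byte stores, each preceded by a byte-shift of the vector. */
static void store16_old(uint8_t *dst, __m128i v) {
  *(int *)dst = _mm_cvtsi128_si32(v);
  v = _mm_srli_si128(v, 4);
  *(int *)(dst + 4) = _mm_cvtsi128_si32(v);
  v = _mm_srli_si128(v, 4);
  *(int *)(dst + 8) = _mm_cvtsi128_si32(v);
  v = _mm_srli_si128(v, 4);
  *(int *)(dst + 12) = _mm_cvtsi128_si32(v);
}

/* After: the same 16 bytes written with one unaligned store. */
static void store16_new(uint8_t *dst, __m128i v) {
  _mm_storeu_si128((__m128i *)dst, v);
}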