Commit 78842b28 authored by Debargha Mukherjee, committed by Gerrit Code Review
Browse files

Merge "Reinstate "Optimize wedge partition selection." without tests." into nextgenv2

parents c797e709 135d6631
......@@ -2447,7 +2447,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
int wedge_offset_x,
int wedge_offset_y,
#endif // CONFIG_SUPERTX
int mi_x, int mi_y,
uint8_t *ext_dst0,
int ext_dst_stride0,
uint8_t *ext_dst1,
......@@ -2461,8 +2460,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
(void) block;
(void) bw;
(void) bh;
(void) mi_x;
(void) mi_y;
if (is_compound
&& is_interinter_wedge_used(mbmi->sb_type)
......@@ -2526,12 +2523,9 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
void vp10_build_wedge_inter_predictor_from_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane_from, int plane_to,
int mi_row, int mi_col,
uint8_t *ext_dst0[3], int ext_dst_stride0[3],
uint8_t *ext_dst1[3], int ext_dst_stride1[3]) {
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = plane_from; plane <= plane_to; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
......@@ -2550,7 +2544,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
0, 0,
#endif
mi_x, mi_y,
ext_dst0[plane],
ext_dst_stride0[plane],
ext_dst1[plane],
......@@ -2561,7 +2554,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
0, 0,
#endif
mi_x, mi_y,
ext_dst0[plane],
ext_dst_stride0[plane],
ext_dst1[plane],
......
......@@ -652,7 +652,6 @@ void vp10_build_inter_predictors_for_planes_single_buf(
void vp10_build_wedge_inter_predictor_from_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane_from, int plane_to,
int mi_row, int mi_col,
uint8_t *ext_dst0[3], int ext_dst_stride0[3],
uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
#endif // CONFIG_EXT_INTER
......
......@@ -725,6 +725,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
}
# End vp10_high encoder functions
if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
add_proto qw/uint64_t vp10_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/vp10_wedge_sse_from_residuals sse2/;
add_proto qw/int vp10_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
specialize qw/vp10_wedge_sign_from_residuals sse2/;
add_proto qw/void vp10_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
specialize qw/vp10_wedge_compute_delta_squares sse2/;
}
}
# end encoder functions
1;
This diff is collapsed.
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
 * Computes SSE of a compound predictor constructed from 2 fundamental
 * predictors p0 and p1 using blending with mask.
 *
 * r1: Residuals of p1. (source - p1)
 * d:  Difference of p1 and p0. (p1 - p0)
 * m:  The blending mask
 * N:  Number of pixels
 *
 * 'r1', 'd', and 'm' are contiguous.
 *
 * Computes:
 *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
 *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
 *    where r0 is (source - p0) and r1 is (source - p1), which in turn is
 *    equivalent to:
 *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
 *    i.e. the SSE of the residuals of the compound predictor scaled up by
 *    MAX_MASK_VALUE**2.
 *
 * Note that the blended term is clamped to 16 bits signed inside the loop.
 * This is there to facilitate an equivalent SIMD implementation. It has no
 * effect as long as residuals fit in 16 - WEDGE_WEIGHT_BITS (=10) signed
 * bits, which always holds for 8 bit input, and should hold practically
 * always on real input, as residuals are expected to be small.
 */
uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1,
                                         const int16_t *d,
                                         const uint8_t *m,
                                         int N) {
  uint64_t sse_acc = 0;
  int i;
  assert(N % 64 == 0);
  for (i = 0; i < N; ++i) {
    // Blend: MAX_MASK_VALUE*r1 + m*d, then clamp to int16 range to match
    // the saturating pack used by the SSE2 version.
    int32_t blended = m[i] * d[i] + MAX_MASK_VALUE * r1[i];
    blended = clamp(blended, INT16_MIN, INT16_MAX);
    sse_acc += blended * blended;
  }
  // Scale back down by MAX_MASK_VALUE**2 (with rounding).
  return ROUND_POWER_OF_TWO(sse_acc, 2 * WEDGE_WEIGHT_BITS);
}
/**
 * Choose the mask sign for a compound predictor.
 *
 * ds:    Difference of the squares of the residuals. (r0**2 - r1**2)
 * m:     The blending mask
 * N:     Number of pixels
 * limit: Pre-computed threshold value.
 *        MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
 *
 * 'ds' and 'm' are contiguous.
 *
 * Returns true if the negated mask has lower SSE compared to the positive
 * mask. Computation is based on:
 *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
 *                                     >
 *  Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
 *
 * which can be simplified to:
 *
 *  Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
 *
 * The right hand side is independent of the mask and must be supplied as
 * the 'limit' parameter.
 *
 * With (r0**2 - r1**2) pre-computed and passed in as 'ds', the left hand
 * side is simply a scalar product between an int16_t and uint8_t vector.
 *
 * Note that for efficiency, ds is stored on 16 bits. Real input residuals
 * being small, this should not cause a noticeable issue.
 */
int vp10_wedge_sign_from_residuals_c(const int16_t *ds,
                                     const uint8_t *m,
                                     int N,
                                     int64_t limit) {
  int64_t dot = 0;
  int i;
  assert(N % 64 == 0);
  // Scalar product of the mask with the squared-residual differences.
  for (i = 0; i < N; ++i)
    dot += ds[i] * m[i];
  return dot > limit;
}
/**
 * Compute the element-wise difference of the squares of 2 arrays.
 *
 * d: Difference of the squares of the inputs: a**2 - b**2
 * a: First input array
 * b: Second input array
 * N: Number of elements
 *
 * 'd', 'a', and 'b' are contiguous.
 *
 * The result is saturated to signed 16 bits.
 */
void vp10_wedge_compute_delta_squares_c(int16_t *d,
                                        const int16_t *a,
                                        const int16_t *b,
                                        int N) {
  int i;
  assert(N % 64 == 0);
  for (i = 0; i < N; ++i) {
    const int delta_sq = a[i] * a[i] - b[i] * b[i];
    // Saturate to int16 to match the SSE2 implementation's packs.
    d[i] = clamp(delta_sq, INT16_MIN, INT16_MAX);
  }
}
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <immintrin.h>
#include "vpx_dsp/x86/synonyms.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
 * See vp10_wedge_sse_from_residuals_c
 *
 * SSE2 implementation: processes 16 pixels per loop iteration in two 8-lane
 * halves. The input pointers are advanced to the end of the buffers and
 * indexed with negative offsets (n, n8) so the loop terminates when n
 * reaches zero.
 */
uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
                                            const int16_t *d,
                                            const uint8_t *m,
                                            int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  // Keeps only the low 32-bit half of each 64-bit lane; used to widen the
  // 32-bit partial sums to 64 bits below.
  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

  do {
    // Load 16 residuals (r1), 16 differences (d) and 16 mask bytes.
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

    // Interleave d and r1 into (d, r1) 16-bit pairs...
    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    // ...zero-extend the mask bytes to 16 bits...
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    // ...and pair each mask value with MAX_MASK_VALUE, so that madd below
    // computes m*d + MAX_MASK_VALUE*r1 for each pixel.
    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

    // Saturating pack to 16 bits implements the clamp of the C reference.
    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

    // Square and pairwise-add -> 32-bit partial SSE terms.
    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

    // Widen the 32-bit partials to 64 bits before accumulating so the
    // accumulator cannot overflow.
    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

  // Horizontal reduction of the two 64-bit accumulator lanes.
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
/**
 * See vp10_wedge_sign_from_residuals_c
 *
 * SSE2 implementation: processes 64 pixels per loop iteration, accumulating
 * the mask/ds scalar product into two 4x32-bit accumulators which are
 * sign-extended and folded to a single 64-bit total at the end.
 */
int vp10_wedge_sign_from_residuals_sse2(const int16_t *ds,
                                        const uint8_t *m,
                                        int N,
                                        int64_t limit) {
  int64_t acc;

  __m128i v_sign_d;
  __m128i v_acc0_d = _mm_setzero_si128();
  __m128i v_acc1_d = _mm_setzero_si128();
  __m128i v_acc_q;

  // Input size limited to 8192 by the use of 32 bit accumulators and m
  // being between [0, 64]. Overflow might happen at larger sizes,
  // though it is practically impossible on real video input.
  assert(N < 8192);
  assert(N % 64 == 0);

  do {
    // Load 64 mask bytes and 64 int16 ds values.
    const __m128i v_m01_b = xx_load_128(m);
    const __m128i v_m23_b = xx_load_128(m + 16);
    const __m128i v_m45_b = xx_load_128(m + 32);
    const __m128i v_m67_b = xx_load_128(m + 48);

    const __m128i v_d0_w = xx_load_128(ds);
    const __m128i v_d1_w = xx_load_128(ds + 8);
    const __m128i v_d2_w = xx_load_128(ds + 16);
    const __m128i v_d3_w = xx_load_128(ds + 24);
    const __m128i v_d4_w = xx_load_128(ds + 32);
    const __m128i v_d5_w = xx_load_128(ds + 40);
    const __m128i v_d6_w = xx_load_128(ds + 48);
    const __m128i v_d7_w = xx_load_128(ds + 56);

    // Zero-extend the mask bytes to 16 bits.
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());

    // ds[i] * m[i], pairwise summed to 32 bits.
    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);

    // Reduction tree of the eight partial-product vectors.
    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);

    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);

    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);

    ds += 64;
    m += 64;

    N -= 64;
  } while (N);

  // Sign-extend the 32-bit accumulator lanes to 64 bits (cmplt yields the
  // sign mask), then fold everything into a single 64-bit sum.
  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));

  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));

  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  xx_storel_64(&acc, v_acc_q);
#endif

  return acc > limit;
}
// Conditionally negate 16-bit lanes: lanes where v_mask_w is all-ones come
// out negated ((v ^ -1) - (-1) == -v), lanes where it is zero pass through
// unchanged ((v ^ 0) - 0 == v).
static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
  const __m128i v_flipped_w = _mm_xor_si128(v_v_w, v_mask_w);
  return _mm_sub_epi16(v_flipped_w, v_mask_w);
}
/**
 * See vp10_wedge_compute_delta_squares_c
 *
 * SSE2 implementation: processes 32 elements per loop iteration. Interleaves
 * a and b into (a, b) pairs, conditionally negates the b lanes, and uses
 * madd so each pair yields a*a - b*b directly.
 */
void vp10_wedge_compute_delta_squares_sse2(int16_t *d,
                                           const int16_t *a,
                                           const int16_t *b,
                                           int N) {
  // Alternating 16-bit lane mask (-1, 0, -1, 0, ...) selecting the lanes to
  // negate in negm_epi16.
  const __m128i v_neg_w = _mm_set_epi16(0xffff, 0, 0xffff, 0,
                                        0xffff, 0, 0xffff, 0);

  assert(N % 64 == 0);

  do {
    // Load 32 elements from each input.
    const __m128i v_a0_w = xx_load_128(a);
    const __m128i v_b0_w = xx_load_128(b);
    const __m128i v_a1_w = xx_load_128(a + 8);
    const __m128i v_b1_w = xx_load_128(b + 8);
    const __m128i v_a2_w = xx_load_128(a + 16);
    const __m128i v_b2_w = xx_load_128(b + 16);
    const __m128i v_a3_w = xx_load_128(a + 24);
    const __m128i v_b3_w = xx_load_128(b + 24);

    // Interleave into (a, b) 16-bit pairs.
    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);

    // Negate top word of pairs, producing (a, -b).
    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);

    // madd of (a, b) with (a, -b) computes a*a - b*b for each pair.
    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);

    // Saturating pack back to 16 bits (the clamp of the C reference).
    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);

    xx_store_128(d, v_r0_w);
    xx_store_128(d + 8, v_r1_w);
    xx_store_128(d + 16, v_r2_w);
    xx_store_128(d + 24, v_r3_w);

    a += 32;
    b += 32;
    d += 32;

    N -= 32;
  } while (N);
}
......@@ -124,6 +124,10 @@ endif
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
endif
ifeq ($(CONFIG_EXT_INTER),yes)
VP10_CX_SRCS-yes += encoder/wedge_utils.c
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
endif
VP10_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment