Commit 135d6631 authored by Geza Lore

Reinstate "Optimize wedge partition selection." without tests.

This reinstates commit efda2831
without the tests and with fixes for 32-bit x86 builds.

Change-Id: I34be4fe1e8a67686d26ba256fd7efe0eb6a569e8
parent 52141c91
@@ -2447,7 +2447,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
int wedge_offset_x,
int wedge_offset_y,
#endif // CONFIG_SUPERTX
int mi_x, int mi_y,
uint8_t *ext_dst0,
int ext_dst_stride0,
uint8_t *ext_dst1,
@@ -2461,8 +2460,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
(void) block;
(void) bw;
(void) bh;
(void) mi_x;
(void) mi_y;
if (is_compound
&& is_interinter_wedge_used(mbmi->sb_type)
@@ -2526,12 +2523,9 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
void vp10_build_wedge_inter_predictor_from_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane_from, int plane_to,
int mi_row, int mi_col,
uint8_t *ext_dst0[3], int ext_dst_stride0[3],
uint8_t *ext_dst1[3], int ext_dst_stride1[3]) {
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = plane_from; plane <= plane_to; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
@@ -2550,7 +2544,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
0, 0,
#endif
mi_x, mi_y,
ext_dst0[plane],
ext_dst_stride0[plane],
ext_dst1[plane],
@@ -2561,7 +2554,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
0, 0,
#endif
mi_x, mi_y,
ext_dst0[plane],
ext_dst_stride0[plane],
ext_dst1[plane],
@@ -652,7 +652,6 @@ void vp10_build_inter_predictors_for_planes_single_buf(
void vp10_build_wedge_inter_predictor_from_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane_from, int plane_to,
int mi_row, int mi_col,
uint8_t *ext_dst0[3], int ext_dst_stride0[3],
uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
#endif // CONFIG_EXT_INTER
@@ -725,6 +725,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
}
# End vp10_high encoder functions
if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
add_proto qw/uint64_t vp10_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/vp10_wedge_sse_from_residuals sse2/;
add_proto qw/int vp10_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
specialize qw/vp10_wedge_sign_from_residuals sse2/;
add_proto qw/void vp10_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
specialize qw/vp10_wedge_compute_delta_squares sse2/;
}
}
# end encoder functions
1;
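The three prototypes above are resolved through the generated RTCD dispatch, so encoder code calls the plain symbols and the SSE2 specializations below are picked up automatically on capable targets. A minimal, hypothetical call-site sketch (the "./vp10_rtcd.h" include path, the wrapper name, and the buffer names are assumptions; the buffers must be contiguous and N a multiple of 64, as documented in the C implementations):

#include "./vp10_rtcd.h"       /* generated dispatch header (assumed path) */
#include "vpx/vpx_integer.h"

/* Hypothetical call site: dispatches to vp10_wedge_sse_from_residuals_sse2
 * when SSE2 is available, otherwise to the C fallback. */
static uint64_t example_wedge_sse(const int16_t *r1, const int16_t *d,
                                  const uint8_t *mask, int N) {
  return vp10_wedge_sse_from_residuals(r1, d, mask, N);
}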
This diff is collapsed.
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
* Computes SSE of a compound predictor constructed from 2 fundamental
* predictors p0 and p1 using blending with mask.
*
* r1: Residuals of p1.
* (source - p1)
* d: Difference of p1 and p0.
* (p1 - p0)
* m: The blending mask
* N: Number of pixels
*
* 'r1', 'd', and 'm' are contiguous.
*
* Computes:
* Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
* Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
* where r0 is (source - p0) and r1 is (source - p1), which in turn is
* equivalent to:
* Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
* which is the SSE of the residuals of the compound predictor scaled up by
* MAX_MASK_VALUE**2.
*
* Note that we clamp the partial term in the loop to 16 bits signed. This is
* to facilitate an equivalent SIMD implementation. It has no effect if the
* residuals fit within 16 - WEDGE_WEIGHT_BITS (= 10) signed bits, which is
* always the case for 8-bit input, and on real input it should hold
* practically always, as residuals are expected to be small.
*/
uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1,
const int16_t *d,
const uint8_t *m,
int N) {
uint64_t csse = 0;
int i;
assert(N % 64 == 0);
for (i = 0 ; i < N ; i++) {
int32_t t = MAX_MASK_VALUE*r1[i] + m[i]*d[i];
t = clamp(t, INT16_MIN, INT16_MAX);
csse += t*t;
}
return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
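/*
 * Example (illustration only, not part of this change): forming the inputs
 * above from a hypothetical source block and the two single predictors p0
 * and p1. 'r1' and 'd' are caller-provided scratch buffers of N elements,
 * and N is a multiple of 64.
 */
#if 0
static uint64_t example_compound_sse(const uint8_t *src, const uint8_t *p0,
                                     const uint8_t *p1, const uint8_t *mask,
                                     int16_t *r1, int16_t *d, int N) {
  int i;
  for (i = 0; i < N; ++i) {
    r1[i] = (int16_t)(src[i] - p1[i]);  /* r1 = source - p1 */
    d[i] = (int16_t)(p1[i] - p0[i]);    /* d  = p1 - p0 */
  }
  /* The MAX_MASK_VALUE**2 scaling described above is removed by the final
   * ROUND_POWER_OF_TWO inside the function. */
  return vp10_wedge_sse_from_residuals_c(r1, d, mask, N);
}
#endif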
/**
* Choose the mask sign for a compound predictor.
*
* ds: Difference of the squares of the residuals.
* r0**2 - r1**2
* m: The blending mask
* N: Number of pixels
* limit: Pre-computed threshold value.
* MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
*
* 'ds' and 'm' are contiguous.
*
* Returns true if the negated mask has lower SSE compared to the positive
* mask. Computation is based on:
* Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
* >
* Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
*
* which can be simplified to:
*
* Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
*
* The right hand side does not depend on the mask, and needs to be passed as
* the 'limit' parameter.
*
* After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
* hand side is simply a scalar product between an int16_t and uint8_t vector.
*
* Note that, for efficiency, ds is stored as 16-bit values. Since real input
* residuals are small, this should not cause a noticeable issue.
*/
int vp10_wedge_sign_from_residuals_c(const int16_t *ds,
const uint8_t *m,
int N,
int64_t limit) {
int64_t acc = 0;
assert(N % 64 == 0);
do {
acc += *ds++ * *m++;
} while (--N);
return acc > limit;
}
/**
* Compute the element-wise difference of the squares of 2 arrays.
*
* d: Difference of the squares of the inputs: a**2 - b**2
* a: First input array
* b: Second input array
* N: Number of elements
*
* 'd', 'a', and 'b' are contiguous.
*
* The result is saturated to signed 16 bits.
*/
void vp10_wedge_compute_delta_squares_c(int16_t *d,
const int16_t *a,
const int16_t *b,
int N) {
int i;
assert(N % 64 == 0);
for (i = 0 ; i < N ; i++)
d[i] = clamp(a[i]*a[i] - b[i]*b[i], INT16_MIN, INT16_MAX);
}
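/*
 * Example (illustration only, not part of this change): one way the helpers
 * above could be combined to pick the mask sign for a wedge candidate.
 * 'r0' and 'r1' are the residuals of two hypothetical single predictors,
 * 'mask' is the wedge mask, 'ds' is caller-provided scratch of N elements,
 * and N is a multiple of 64. A real call site would normally go through the
 * RTCD-dispatched symbols rather than the _c versions used here.
 */
#if 0
static int example_pick_wedge_sign(const int16_t *r0, const int16_t *r1,
                                   const uint8_t *mask, int16_t *ds, int N) {
  int64_t sum_r0_sq = 0, sum_r1_sq = 0;
  int64_t limit;
  int i;
  for (i = 0; i < N; ++i) {
    sum_r0_sq += (int64_t)r0[i] * r0[i];
    sum_r1_sq += (int64_t)r1[i] * r1[i];
  }
  /* Right hand side of the comparison derived in the comment above. */
  limit = (MAX_MASK_VALUE / 2) * (sum_r0_sq - sum_r1_sq);
  /* ds[i] = r0[i]**2 - r1[i]**2, saturated to 16 bits. */
  vp10_wedge_compute_delta_squares_c(ds, r0, r1, N);
  /* Nonzero means the inverted mask yields the lower SSE. */
  return vp10_wedge_sign_from_residuals_c(ds, mask, N, limit);
}
#endif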
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <immintrin.h>
#include "vpx_dsp/x86/synonyms.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
* See vp10_wedge_sse_from_residuals_c
*/
uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
const int16_t *d,
const uint8_t *m,
int N) {
int n = -N;
int n8 = n + 8;
uint64_t csse;
const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
__m128i v_acc0_q = _mm_setzero_si128();
assert(N % 64 == 0);
r1 += N;
d += N;
m += N;
do {
const __m128i v_r0_w = xx_load_128(r1 + n);
const __m128i v_r1_w = xx_load_128(r1 + n8);
const __m128i v_d0_w = xx_load_128(d + n);
const __m128i v_d1_w = xx_load_128(d + n8);
const __m128i v_m01_b = xx_load_128(m + n);
const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
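// Each madd below pairs (d, r1) with (m, MAX_MASK_VALUE), producing
// MAX_MASK_VALUE*r1 + m*d in every 32-bit lane, i.e. the per-pixel term of
// the C implementation.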
const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
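// Pack back to 16 bits with signed saturation; this corresponds to the
// clamp() to INT16_MIN/INT16_MAX in the C implementation.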
const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
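// The squared sums are non-negative, so zero-extend each 32-bit lane to 64
// bits and add horizontal pairs to avoid overflowing the accumulator.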
const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
_mm_srli_epi64(v_sq0_d, 32));
const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
_mm_srli_epi64(v_sq1_d, 32));
v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
n8 += 16;
n += 16;
} while (n);
v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
#if ARCH_X86_64
csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
xx_storel_64(&csse, v_acc0_q);
#endif
return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
/**
* See vp10_wedge_sign_from_residuals_c
*/
int vp10_wedge_sign_from_residuals_sse2(const int16_t *ds,
const uint8_t *m,
int N,
int64_t limit) {
int64_t acc;
__m128i v_sign_d;
__m128i v_acc0_d = _mm_setzero_si128();
__m128i v_acc1_d = _mm_setzero_si128();
__m128i v_acc_q;
// The input size is limited to 8192 by the use of 32-bit accumulators and m
// being in [0, 64]. Overflow might happen at larger sizes, though it is
// practically impossible on real video input.
assert(N < 8192);
assert(N % 64 == 0);
do {
const __m128i v_m01_b = xx_load_128(m);
const __m128i v_m23_b = xx_load_128(m + 16);
const __m128i v_m45_b = xx_load_128(m + 32);
const __m128i v_m67_b = xx_load_128(m + 48);
const __m128i v_d0_w = xx_load_128(ds);
const __m128i v_d1_w = xx_load_128(ds + 8);
const __m128i v_d2_w = xx_load_128(ds + 16);
const __m128i v_d3_w = xx_load_128(ds + 24);
const __m128i v_d4_w = xx_load_128(ds + 32);
const __m128i v_d5_w = xx_load_128(ds + 40);
const __m128i v_d6_w = xx_load_128(ds + 48);
const __m128i v_d7_w = xx_load_128(ds + 56);
const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
ds += 64;
m += 64;
N -= 64;
} while (N);
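// Sign-extend the 32-bit partial sums to 64 bits (the cmplt against zero
// yields the sign words) and reduce them to a single 64-bit total.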
v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
_mm_unpackhi_epi32(v_acc0_d, v_sign_d));
v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
_mm_unpackhi_epi32(v_acc1_d, v_sign_d));
v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
#if ARCH_X86_64
acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
xx_storel_64(&acc, v_acc_q);
#endif
return acc > limit;
}
// Negate under mask
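// For lanes where v_mask_w is all ones, (v ^ mask) - mask == ~v + 1 == -v;
// lanes where it is zero pass through unchanged.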
static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
}
/**
* See vp10_wedge_compute_delta_squares_c
*/
void vp10_wedge_compute_delta_squares_sse2(int16_t *d,
const int16_t *a,
const int16_t *b,
int N) {
const __m128i v_neg_w = _mm_set_epi16(0xffff, 0, 0xffff, 0,
0xffff, 0, 0xffff, 0);
assert(N % 64 == 0);
do {
const __m128i v_a0_w = xx_load_128(a);
const __m128i v_b0_w = xx_load_128(b);
const __m128i v_a1_w = xx_load_128(a + 8);
const __m128i v_b1_w = xx_load_128(b + 8);
const __m128i v_a2_w = xx_load_128(a + 16);
const __m128i v_b2_w = xx_load_128(b + 16);
const __m128i v_a3_w = xx_load_128(a + 24);
const __m128i v_b3_w = xx_load_128(b + 24);
const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
// Negate top word of pairs
const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
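// madd of the (a, b) pairs with the (a, -b) pairs gives a*a - b*b in each
// 32-bit lane.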
const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
xx_store_128(d, v_r0_w);
xx_store_128(d + 8, v_r1_w);
xx_store_128(d + 16, v_r2_w);
xx_store_128(d + 24, v_r3_w);
a += 32;
b += 32;
d += 32;
N -= 32;
} while (N);
}
@@ -124,6 +124,10 @@ endif
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
endif
ifeq ($(CONFIG_EXT_INTER),yes)
VP10_CX_SRCS-yes += encoder/wedge_utils.c
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
endif
VP10_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c