Commit b6da40ad authored by Yaowu Xu

Merge branch 'master' into nextgenv2

Change-Id: I0e4030a37354bb23b3aa8be5cc1473770b9e7b06
parents 236623cf dc9d36c0
@@ -67,12 +67,22 @@ TEST_P(ErrorBlockTest, OperationCheck) {
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
const int msb = bit_depth_ + 8 - 1;
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2 << 20) - (1 << 20);
dqcoeff[j] = rnd(2 << 20) - (1 << 20);
// coeff and dqcoeff will always have at least the same sign, and this
// can be used for optimization, so generate test input precisely.
if (rnd(2)) {
// Positive number
coeff[j] = rnd(1 << msb);
dqcoeff[j] = rnd(1 << msb);
} else {
// Negative number
coeff[j] = -rnd(1 << msb);
dqcoeff[j] = -rnd(1 << msb);
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
@@ -85,7 +95,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure;
}
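A hedged aside on the new test inputs: coeff and dqcoeff share a sign because
dqcoeff is coeff quantized and then rescaled, and neither step can flip the
sign. A minimal sketch of that relationship (illustrative names, not the
libvpx quantizer):

static void quant_dequant_sketch(int coeff, int step, int *dqcoeff) {
  const int qcoeff = coeff / step; /* C division truncates toward zero */
  *dqcoeff = qcoeff * step;        /* rescaling keeps the sign (or zero) */
}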
@@ -100,23 +110,36 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
int max_val = ((1 << 20) - 1);
const int msb = bit_depth_ + 8 - 1;
int max_val = ((1 << msb) - 1);
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
int k = (i / 9) % 5;
int k = (i / 9) % 9;
// Change the maximum coeff value to test different bit boundaries
if ( k == 4 && (i % 9) == 0 ) {
if ( k == 8 && (i % 9) == 0 ) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values
coeff[j] = k % 2 ? max_val : -max_val;
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
if (k < 4) {
// Test at positive maximum values
coeff[j] = k % 2 ? max_val : 0;
dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
} else if (k < 8) {
// Test at negative maximum values
coeff[j] = k % 2 ? -max_val : 0;
dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
} else {
coeff[j] = rnd(2 << 14) - (1 << 14);
dqcoeff[j] = rnd(2 << 14) - (1 << 14);
if (rnd(2)) {
// Positive number
coeff[j] = rnd(1 << 14);
dqcoeff[j] = rnd(1 << 14);
} else {
// Negative number
coeff[j] = -rnd(1 << 14);
dqcoeff[j] = -rnd(1 << 14);
}
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,21 +153,13 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure;
}
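A worked instance of the boundary walk above, assuming VPX_BITS_8: msb is
8 + 8 - 1 = 15, so max_val starts at (1 << 15) - 1 = 32767 and is halved to
16383, 8191, ... each time the k cycle comes back around (every 81
iterations), so a different bit boundary is exercised on each pass.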
using std::tr1::make_tuple;
#if CONFIG_USE_X86INC && HAVE_SSE2
int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
}
#if CONFIG_USE_X86INC
int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
@@ -153,6 +168,15 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
}
#if HAVE_SSE2
int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
}
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(
@@ -165,5 +189,23 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
&wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#if HAVE_AVX
int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
}
INSTANTIATE_TEST_CASE_P(
AVX, ErrorBlockTest,
::testing::Values(
make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
&wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_AVX
#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
@@ -39,12 +39,20 @@ void vp10_calc_indices(const double *data, const double *centroids,
}
}
// Generate a random number in the range [0, 32768).
static unsigned int lcg_rand16(unsigned int *state) {
*state = *state * 1103515245 + 12345;
return *state / 65536 % 32768;
}
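The constants 1103515245 and 12345 are the ones from the C standard's example
rand() implementation, and state / 65536 % 32768 returns bits 16-30 of the
state, skipping the short-period low bits. A usage sketch (hypothetical
driver, not part of the patch):

  unsigned int state = 42;                    /* any deterministic seed */
  const unsigned int r = lcg_rand16(&state);  /* uniform-ish in [0, 32768) */

Unlike rand(), the sequence is fully specified, so the reseeding in
calc_centroids() below is reproducible across platforms.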
static void calc_centroids(const double *data, double *centroids,
const uint8_t *indices, int n, int k, int dim) {
int i, j, index;
int count[PALETTE_MAX_SIZE];
unsigned int rand_state = data[0];
assert(n <= 32768);
srand((unsigned int) data[0]);
memset(count, 0, sizeof(count[0]) * k);
memset(centroids, 0, sizeof(centroids[0]) * k * dim);
@@ -59,8 +67,7 @@ static void calc_centroids(const double *data, double *centroids,
for (i = 0; i < k; ++i) {
if (count[i] == 0) {
// TODO(huisu): replace rand() with something else.
memcpy(centroids + i * dim, data + (rand() % n) * dim,
memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
sizeof(centroids[0]) * dim);
} else {
const double norm = 1.0 / count[i];
@@ -512,7 +512,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_NONE) {
// full forward transform and quantization
vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
vp10_xform_quant(x, plane, block, blk_row, blk_col,
plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_AC_ONLY) {
@@ -554,7 +554,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int16_t token;
EXTRABIT extra;
pt = get_entropy_context(tx_size, pd->above_context + blk_col,
pd->left_context + blk_row);
scan = so->scan;
@@ -248,7 +248,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
@@ -21,12 +21,7 @@
#include "vp9/encoder/vp9_denoiser.h"
#include "vp9/encoder/vp9_encoder.h"
/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
* with speed 6, though it (inexplicably) seems to also work with speed 5 (one
* would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
* make the calls to the vp9_denoiser_* functions when in speed 5).
*
* The implementation is very similar to that of the VP8 denoiser. While
/* The VP9 denoiser is similar to that of the VP8 denoiser. While
* choosing the motion vectors / reference frames, the denoiser is run, and if
* it did not modify the signal too much, the denoised block is copied to the
* signal.
@@ -328,7 +323,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
struct buf_2d src = mb->plane[0].src;
int is_skin = 0;
if (bs <= BLOCK_16X16 && !denoiser->no_denoising) {
if (bs <= BLOCK_16X16 && denoiser->denoising_on) {
// Take center pixel in block to determine is_skin.
const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
@@ -345,7 +340,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
is_skin = vp9_skin_pixel(ysource, usource, vsource);
}
if (!denoiser->no_denoising)
if (denoiser->denoising_on)
decision = perform_motion_compensation(denoiser, mb, bs,
denoiser->increase_denoising,
mi_row, mi_col, ctx,
@@ -528,8 +523,8 @@ void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
int height) {
// Denoiser is off by default, i.e., no denoising is performed.
// Noise level is measured periodically, and if observed to be above
// thresh_noise_estimate, then denoising is performed, i.e., no_denoising = 0.
denoiser->no_denoising = 1;
// thresh_noise_estimate, then denoising is performed, i.e., denoising_on = 1.
denoiser->denoising_on = 0;
denoiser->noise_estimate = 0;
denoiser->noise_estimate_count = 0;
denoiser->thresh_noise_estimate = 20;
@@ -657,9 +652,9 @@ void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
// Reset counter and check noise level condition.
cpi->denoiser.noise_estimate_count = 0;
if (cpi->denoiser.noise_estimate > cpi->denoiser.thresh_noise_estimate)
cpi->denoiser.no_denoising = 0;
cpi->denoiser.denoising_on = 1;
else
cpi->denoiser.no_denoising = 1;
cpi->denoiser.denoising_on = 0;
}
}
}
@@ -32,7 +32,7 @@ typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG last_source;
int increase_denoising;
int frame_buffer_initialized;
int no_denoising;
int denoising_on;
int noise_estimate;
int thresh_noise_estimate;
int noise_estimate_count;
@@ -1183,10 +1183,13 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
double group_weight_factor) {
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
// Clamp the target rate to the VBR min / max limits.
const int target_rate =
vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
if (section_target_bandwidth <= 0) {
if (target_rate <= 0) {
return rc->worst_quality; // Highest value allowed
} else {
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
@@ -1195,7 +1198,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
const double av_err_per_mb = section_err / active_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
BPER_MB_NORMBITS) / active_mbs;
int q;
@@ -2444,7 +2447,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if ((i <= rc->max_gf_interval) ||
((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
const double frame_boost =
calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
@@ -2737,11 +2740,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
}
target_rate = gf_group->bit_allocation[gf_group->index];
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
else
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
{
@@ -1816,6 +1816,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
int target_rate = rc->base_frame_target;
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
else
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
// Correction to rate target based on prior over- or under-shoot.
if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
vbr_rate_correction(cpi, &target_rate);
@@ -296,30 +296,11 @@ int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz) {
int i;
int32_t c, d;
int64_t error = 0, sqcoeff = 0;
int16_t diff;
const int32_t hi = 0x00007fff;
const int32_t lo = 0xffff8000;
for (i = 0; i < block_size; i++) {
c = coeff[i];
d = dqcoeff[i];
// Saturate to 16 bits
c = (c > hi) ? hi : ((c < lo) ? lo : c);
d = (d > hi) ? hi : ((d < lo) ? lo : d);
diff = d - c;
error += diff * diff;
sqcoeff += c * c;
}
assert(error >= 0 && sqcoeff >= 0);
*ssz = sqcoeff;
return error;
// Note that the C versions of these two functions (vp9_block_error and
// vp9_highbd_block_error_8bit) are the same, but the optimized assembly
// routines are not compatible in the non-high-bitdepth configuration, so
// they still cannot share the same name.
return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
}
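For reference, a sketch of that shared computation (reconstructed here, not
copied from the tree); with bps == 8 the inputs already fit in 16 bits and
both sums accumulate in 64 bits, so the saturation in the deleted version was
not needed:

int64_t block_error_sketch(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];  /* inputs fit easily in int */
    error += (int64_t)diff * diff;           /* 64-bit accumulation */
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}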
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
;
; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
SECTION .text
ALIGN 16
;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
; intptr_t block_size, int64_t *ssz)
;
INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
vzeroupper
; If only one iteration is required, then handle this as a special case.
; It is the most frequent case, so we can have a significant gain here
; by not setting up a loop and accumulators.
cmp sizeq, 16
jne .generic
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Common case of size == 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Load input vectors
mova xm0, [dqcq]
packssdw xm0, [dqcq+16]
mova xm2, [uqcq]
packssdw xm2, [uqcq+16]
mova xm1, [dqcq+32]
packssdw xm1, [dqcq+48]
mova xm3, [uqcq+32]
packssdw xm3, [uqcq+48]
; Compute the errors.
psubw xm0, xm2
psubw xm1, xm3
; Individual errors are max 15 bits + sign, so squares are 30 bits, and
; thus the sum of 2 should fit in a 31-bit integer (+ unused sign bit).
pmaddwd xm2, xm2
pmaddwd xm3, xm3
pmaddwd xm0, xm0
pmaddwd xm1, xm1
; Squares are always positive, so we can use unsigned arithmetic after
; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
; fit in 32 bits.
paddd xm2, xm3
paddd xm0, xm1
; Accumulate horizontally in 64 bits; there is no chance of overflow here
pxor xm5, xm5
pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
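; (in the pblendw ops above, imm 0x33 takes words 0-1 and 4-5 from the last
; operand, i.e. the low dword of each qword lane, and fills the remaining
; words from the zeroed xm5)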
paddq xm2, xm3
paddq xm0, xm1
psrldq xm3, xm2, 8
psrldq xm1, xm0, 8
paddq xm2, xm3
paddq xm0, xm1
; Store the return value
%if ARCH_X86_64
movq rax, xm0
movq [sszq], xm2
%else
movd eax, xm0
pextrd edx, xm0, 1
movq [sszd], xm2
%endif
RET
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, speculative low precision
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
.generic:
pxor xm4, xm4 ; sse accumulator
pxor xm5, xm5 ; overflow detection register for xm4
pxor xm6, xm6 ; ssz accumulator
pxor xm7, xm7 ; overflow detection register for xm6
lea uqcq, [uqcq+sizeq*4]
lea dqcq, [dqcq+sizeq*4]
neg sizeq
; Push the negative size as the high precision code might need it
push sizeq
.loop:
; Load input vectors
mova xm0, [dqcq+sizeq*4]
packssdw xm0, [dqcq+sizeq*4+16]
mova xm2, [uqcq+sizeq*4]
packssdw xm2, [uqcq+sizeq*4+16]
mova xm1, [dqcq+sizeq*4+32]
packssdw xm1, [dqcq+sizeq*4+48]
mova xm3, [uqcq+sizeq*4+32]
packssdw xm3, [uqcq+sizeq*4+48]
add sizeq, 16
; Compute the squared errors.
; Individual errors are max 15 bits + sign, so squares are 30 bits, and
; thus the sum of 2 should fit in a 31-bit integer (+ unused sign bit).
psubw xm0, xm2
pmaddwd xm2, xm2
pmaddwd xm0, xm0
psubw xm1, xm3
pmaddwd xm3, xm3
pmaddwd xm1, xm1
; Squares are always positive, so we can use unsigned arithmetic after
; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
; fit in 32 bits.
paddd xm2, xm3
paddd xm0, xm1
; We accumulate using 32-bit arithmetic, but detect potential overflow
; by checking whether the MSB of the accumulators has ever been set.
; If it has, we redo the whole computation at the end in higher precision,
; but this happens extremely rarely, so we still achieve a net gain.
paddd xm4, xm0
paddd xm6, xm2
por xm5, xm4 ; OR in the accumulator for overflow detection
por xm7, xm6 ; OR in the accumulator for overflow detection
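; (the jnz below consumes the flags set by the "add sizeq, 16" above; the
; SSE integer ops in between do not modify EFLAGS)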
jnz .loop
; Add pairs horizontally (still only on 32 bits)
phaddd xm4, xm4
por xm5, xm4 ; OR in the accumulator for overflow detection
phaddd xm6, xm6
por xm7, xm6 ; OR in the accumulator for overflow detection
; Check for the possibility of overflow by testing whether the MSB (bit 31)
; of each dword lane has ever been set. If it never was, there was no
; overflow and the final sums fit in 32 bits. If overflow happened,
; we redo the whole computation in higher precision.
por xm7, xm5
pmovmskb r4, xm7
test r4, 0x8888
jnz .highprec
phaddd xm4, xm4
phaddd xm6, xm6
pmovzxdq xm4, xm4
pmovzxdq xm6, xm6
; Restore stack
pop sizeq
; Store the return value
%if ARCH_X86_64
movq rax, xm4
movq [sszq], xm6
%else
movd eax, xm4
pextrd edx, xm4, 1
movq [sszd], xm6
%endif
RET
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, high precision case
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
pxor xm4, xm4 ; sse accumulator
pxor xm5, xm5 ; dedicated zero register
pxor xm6, xm6 ; ssz accumulator
pop sizeq
.loophp:
mova xm0, [dqcq+sizeq*4]
packssdw xm0, [dqcq+sizeq*4+16]
mova xm2, [uqcq+sizeq*4]
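A scalar C sketch of the speculative strategy used by the generic path above
(illustrative, not the shipped code): accumulate in 32 bits while OR-ing every
partial sum into a sticky mask, and redo the whole sum in 64 bits only if the
mask shows bit 31 was ever set. It assumes |a[i]|, |b[i]| <= 1 << 14, matching
the 15-bit + sign bound noted in the assembly comments.

#include <stdint.h>
#include <stddef.h>

static int64_t sse_speculative(const int16_t *a, const int16_t *b, size_t n) {
  uint32_t acc = 0, sticky = 0;
  size_t i;
  for (i = 0; i < n; i++) {
    const int32_t d = a[i] - b[i]; /* 15 bits + sign */
    acc += (uint32_t)(d * d);      /* each square fits in 30 bits */
    sticky |= acc;                 /* remembers whether bit 31 was ever set */
  }
  if (!(sticky & 0x80000000u))     /* bit 31 never set: 32-bit sum is exact */
    return (int64_t)acc;
  /* Rare slow path: redo everything in 64-bit precision. */
  {
    int64_t wide = 0;
    for (i = 0; i < n; i++) {
      const int64_t d = (int64_t)a[i] - b[i];
      wide += d * d;
    }
    return wide;
  }
}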