Commit f65473c0 authored by Yunqing Wang's avatar Yunqing Wang Committed by Gerrit Code Review

Merge "Migrate quantization functions from vp9/ to vpx_dsp/"

parents b0e6811a 38f1fbbb
......@@ -19,7 +19,7 @@
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
......
......@@ -56,20 +56,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
return (uint16_t)clamp(val, 0, 4095);
}
}
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_DEBUG
......
......@@ -781,12 +781,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/;
add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/;
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/;
add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant/;
} else {
......@@ -802,12 +796,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
......@@ -935,12 +923,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp_32x32/;
add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b sse2/;
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
#
......
......@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
......@@ -23,7 +24,6 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
......
......@@ -9,7 +9,7 @@
*/
#include <math.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
......@@ -20,113 +20,6 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
void vp9_quantize_dc(const tran_low_t *coeff_ptr,
int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
if (tmp)
eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
int eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + round_ptr[0];
const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
if (abs_qcoeff)
eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
if (tmp)
eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
const int n_coeffs = 1024;
int eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
if (abs_qcoeff)
eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
......@@ -298,224 +191,6 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
}
#endif
void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= zbins[rc != 0]) {
int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 16; // quantization
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= zbins[rc != 0]) {
const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (abs_qcoeff)
eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#endif
void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
int tmp;
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
quant_shift_ptr[rc != 0]) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp)
eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp1 = abs_coeff
+ ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff)
eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#endif
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan) {
MACROBLOCKD *const xd = &x->e_mbd;
......
......@@ -37,34 +37,9 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
void vp9_quantize_dc(const tran_low_t *coeff_ptr,
int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr);
#endif
struct VP9_COMP;
struct VP9Common;
......
......@@ -14,214 +14,6 @@
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t* zbin_ptr,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
(void)scan_ptr;
coeff_ptr += n_coeffs;
iscan_ptr += n_coeffs;
qcoeff_ptr += n_coeffs;
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
zero = _mm_setzero_si128();
if (!skip_block) {
__m128i eob;
__m128i zbin;
__m128i round, quant, dequant, shift;
{
__m128i coeff0, coeff1;
// Setup global values
{
__m128i pw_1;
zbin = _mm_load_si128((const __m128i*)zbin_ptr);
round = _mm_load_si128((const __m128i*)round_ptr);
quant = _mm_load_si128((const __m128i*)quant_ptr);
pw_1 = _mm_set1_epi16(1);
zbin = _mm_sub_epi16(zbin, pw_1);
dequant = _mm_load_si128((const __m128i*)dequant_ptr);
shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
}
{
__m128i coeff0_sign, coeff1_sign;
__m128i qcoeff0, qcoeff1;
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
// Do DC and first 15 AC
coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
coeff1_sign = _mm_srai_epi16(coeff1, 15);
qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
round = _mm_unpackhi_epi64(round, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
quant = _mm_unpackhi_epi64(quant, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
shift = _mm_unpackhi_epi64(shift, shift);
qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
// Reinsert signs
qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
// Mask out zbin threshold coeffs
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
}
{
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
__m128i iscan0, iscan1;
__m128i eob1;
zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
eob = _mm_and_si128(iscan0, nzero_coeff0);
eob1 = _mm_and_si128(iscan1, nzero_coeff1);
eob = _mm_max_epi16(eob, eob1);
}
n_coeffs += 8 * 2;
}
// AC only loop
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
{
__m128i coeff0_sign, coeff1_sign;
__m128i qcoeff0, qcoeff1;
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
coeff1_sign = _mm_srai_epi16(coeff1, 15);
qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);