Commit eed1bade authored by Jingning Han's avatar Jingning Han
Browse files

Optimize quantization simd implementation

This commit allows the quantizer to compare the AC coefficients to
the quantization step size to determine if further multiplication
operations are needed. It makes the quantization process 20% faster
without coding statistics change.

Change-Id: I735aaf6a9c0874c82175bb565b20e131464db64a
parent 502ac722
......@@ -293,7 +293,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
if (!skip_block) {
__m128i eob;
__m128i round, quant, dequant;
__m128i round, quant, dequant, thr;
int16_t nzflag;
{
__m128i coeff0, coeff1;
......@@ -368,6 +369,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
// AC only loop
index = 2;
thr = _mm_srai_epi16(dequant, 1);
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
{
......@@ -387,28 +389,39 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
_mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
// Reinsert signs
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
// Reinsert signs
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
} else {
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
}
}
{
if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
......
......@@ -230,6 +230,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
__m128i thr;
int16_t nzflag;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
......@@ -316,6 +318,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
n_coeffs += 8 * 2;
}
thr = _mm_srai_epi16(dequant, 1);
// AC only loop
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
......@@ -335,28 +339,39 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
_mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
// Reinsert signs
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
// Reinsert signs
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
} else {
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
}
}
{
if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment