Commit 26d3d3af authored by Jingning Han, committed by Gerrit Code Review

Enable 16x16 Hadamard transform in SATD based mode decision

This commit replaces the 16x16 2D-DCT with the Hadamard transform
for the RTC coding mode. This cuts the CPU cycles spent on the
16x16 transform by 5X, making speed -6 encoding 1.5% faster overall
without compromising compression performance.

Change-Id: If6c993831dc4c678d841edc804ff395ed37f2a1b
parent b4b5af6a
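The Hadamard transform needs only additions, subtractions, and arithmetic shifts (no multiplications), which is where the 5X cycle saving over the 16x16 DCT comes from. A minimal usage sketch of the new code path, mirroring the block_yrd() call site changed below (the wrapper name and buffers here are illustrative, not part of the patch):

/* Sketch: SATD cost of one 16x16 residual block via the new transform.
 * Assumes vp9_rtcd.h has been generated with the prototypes added below. */
static int satd_16x16(const int16_t *src_diff, int diff_stride) {
  DECLARE_ALIGNED(16, int16_t, coeff[256]);  /* 16x16 = 256 coefficients */
  vp9_hadamard_16x16(src_diff, diff_stride, coeff);
  return vp9_satd(coeff, 256);  /* sum of absolute transformed diffs */
}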
@@ -1112,8 +1112,8 @@ specialize qw/vp9_avg_4x4 sse2/;
 add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
 specialize qw/vp9_hadamard_8x8 sse2/;
 
-add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
-specialize qw/vp9_hadamard_16x16/;
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
 
 add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
 specialize qw/vp9_satd sse2/;
......
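In libvpx's RTCD (run-time CPU detection) scheme, add_proto declares the C signature and specialize lists the optimized variants; the build then generates vp9_rtcd.h so callers bind to the best available implementation. For a static SSE2 build the generated header amounts to roughly the following (a sketch, not part of this patch; configurations with run-time detection dispatch through function pointers instead):

void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
                          int16_t *coeff);
void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff);
#define vp9_hadamard_16x16 vp9_hadamard_16x16_sse2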
@@ -78,8 +78,15 @@ void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
 }
 
 // In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t *coeff) {
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+                          int16_t *coeff) {
   int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                      + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
   for (idx = 0; idx < 64; ++idx) {
     int16_t a0 = coeff[0];
     int16_t a1 = coeff[64];
......
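The first loop above runs an 8x8 Hadamard on each quadrant of the 16x16 block: (idx >> 1) selects the top/bottom row of quadrants and (idx & 0x01) the left/right column. The loop truncated above then combines, at each of the 64 coefficient positions, the co-located outputs of the four sub-transforms with a 4-point Hadamard butterfly, halving the results to stay in 16-bit range. A scalar sketch of one iteration, reconstructed from the SSE2 version shown further down:

    int16_t a0 = coeff[0];    /* co-located coefficients from the */
    int16_t a1 = coeff[64];   /* four 8x8 sub-transforms          */
    int16_t a2 = coeff[128];
    int16_t a3 = coeff[192];

    int16_t b0 = a0 + a1;     /* first butterfly stage */
    int16_t b1 = a0 - a1;
    int16_t b2 = a2 + a3;
    int16_t b3 = a2 - a3;

    coeff[0]   = (b0 + b2) >> 1;  /* second stage, scaled by 1/2 */
    coeff[64]  = (b1 + b3) >> 1;
    coeff[128] = (b0 - b2) >> 1;
    coeff[192] = (b1 - b3) >> 1;

    ++coeff;                  /* next coefficient position */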
@@ -375,7 +375,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                       scan_order->scan, scan_order->iscan);
       break;
     case TX_16X16:
-      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
       vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                       pd->dequant, eob,
......
@@ -165,6 +165,44 @@ void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
   _mm_storeu_si128((__m128i *)coeff, src[7]);
 }
 
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                      + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    coeff0 = _mm_srai_epi16(coeff0, 1);
+    coeff1 = _mm_srai_epi16(coeff1, 1);
+    _mm_store_si128((__m128i *)coeff, coeff0);
+    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+    coeff2 = _mm_srai_epi16(coeff2, 1);
+    coeff3 = _mm_srai_epi16(coeff3, 1);
+    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+  }
+}
+
 int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
   int i;
   __m128i sum = _mm_load_si128((const __m128i *)coeff);
......
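For reference, the SATD being vectorized here is just the sum of absolute values of the transformed residual. A scalar sketch matching the vp9_satd prototype added above (the function name is illustrative, not the patch's C version verbatim):

#include <stdlib.h>  /* abs() */

static int16_t satd_c_sketch(const int16_t *coeff, int length) {
  int i, satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);  /* accumulate |coeff[i]| */
  return (int16_t)satd;
}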