Commit 8c411f74 authored by Jingning Han

Hadamard transform based coding mode decision process

This commit uses a Hadamard transform based rate-distortion cost
estimate for the rtc coding mode decision. It improves the compression
performance of speed -6 for many hard clips at lower bit-rates, for
example 5.5% for jimredvga, 6.7% for mmmoving, and 6.1% for
niklas720p. It introduces extra encoding cycle cost at this point.

Change-Id: Iaf70634fa2417a705ee29f2456175b981db3d375
parent ba13ff85
@@ -1109,6 +1109,15 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;
add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp9_hadamard_8x8 sse2/;
add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
specialize qw/vp9_hadamard_16x16/;
add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
specialize qw/vp9_satd sse2/;
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
......
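For orientation, here is a minimal sketch of how the primitives registered above could be combined into a transform-domain cost for one 8x8 block in the rtc mode decision. The glue code (the estimate_block_cost_8x8 helper, its residual loop, and the use of a bare SATD as the cost) is illustrative only and is not code from this commit; the C variants are used so no alignment requirements apply.

// Illustrative sketch only, not part of this commit: compute the residual
// for a candidate 8x8 prediction, apply the forward Hadamard transform, and
// use the sum of absolute transform coefficients (SATD) as a cheap
// rate-distortion cost proxy.
#include <stdint.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
int16_t vp9_satd_c(const int16_t *coeff, int length);

static int estimate_block_cost_8x8(const uint8_t *src, int src_stride,
                                   const uint8_t *pred, int pred_stride) {
  int16_t diff[64];
  int16_t coeff[64];
  int r, c;

  // Residual between the source block and the candidate prediction.
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      diff[r * 8 + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);

  // Forward 8x8 Hadamard, then SATD as the cost estimate.
  vp9_hadamard_8x8_c(diff, 8, coeff);
  return vp9_satd_c(coeff, 64);
}

A pick-mode loop would evaluate such a cost for each candidate prediction and keep the lowest, optionally adding a rate term for signalling the chosen mode.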
@@ -28,6 +28,87 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
  return (sum + 8) >> 4;
}

// 1D 8-point Hadamard transform of one column of src_diff; the results are
// written to coeff in a permuted (butterfly) order.
static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int16_t c0 = b0 + b2;
  int16_t c1 = b1 + b3;
  int16_t c2 = b0 - b2;
  int16_t c3 = b1 - b3;
  int16_t c4 = b4 + b6;
  int16_t c5 = b5 + b7;
  int16_t c6 = b4 - b6;
  int16_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff) {
  int idx;
  int16_t buffer[64];
  int16_t *tmp_buf = &buffer[0];

  // First pass: transform each of the 8 columns into a row of the buffer.
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);
    tmp_buf += 8;
    ++src_diff;
  }

  // Second pass: transform the columns of the intermediate buffer.
  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, coeff);
    coeff += 8;
    ++tmp_buf;
  }
}

// In-place 16x16 2D Hadamard transform: combines four 8x8 Hadamard blocks
// (stored back to back at coeff + 0, 64, 128 and 192) with a 4-point
// Hadamard across the blocks. The >> 1 normalization keeps the results
// within 16-bit range.
void vp9_hadamard_16x16_c(int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    int16_t a0 = coeff[0];
    int16_t a1 = coeff[64];
    int16_t a2 = coeff[128];
    int16_t a3 = coeff[192];

    int16_t b0 = a0 + a1;
    int16_t b1 = a0 - a1;
    int16_t b2 = a2 + a3;
    int16_t b3 = a2 - a3;

    coeff[0]   = (b0 + b2) >> 1;
    coeff[64]  = (b1 + b3) >> 1;
    coeff[128] = (b0 - b2) >> 1;
    coeff[192] = (b1 - b3) >> 1;

    ++coeff;
  }
}
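The in-place routine above assumes the 256-coefficient buffer holds four 8x8 Hadamard blocks back to back (at offsets 0, 64, 128 and 192). A plausible caller, sketched here with an assumed quadrant-to-offset mapping that is not taken from this diff, could look like:

// Illustrative sketch only: 16x16 Hadamard of a residual block, built from
// four 8x8 transforms followed by the in-place cross-block pass above.
#include <stdint.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_16x16_c(int16_t *coeff);

static void hadamard_16x16_from_diff(const int16_t *src_diff, int src_stride,
                                     int16_t *coeff /* 256 entries */) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // Assumed mapping: quadrant (row = idx >> 1, col = idx & 1) of the
    // 16x16 block goes to coeff + idx * 64.
    const int16_t *quad =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 1) * 8;
    vp9_hadamard_8x8_c(quad, src_stride, coeff + idx * 64);
  }
  // Second stage: 4-point Hadamard across the four blocks, in place.
  vp9_hadamard_16x16_c(coeff);
}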

// Sum of absolute transform coefficients (SATD).
int16_t vp9_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);

  // Accumulated in an int and truncated to int16_t on return.
  return (int16_t)satd;
}

// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                       const int ref_stride, const int height) {
......
@@ -57,6 +57,141 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  return (avg + 8) >> 4;
}

static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  // Butterfly stages of the 8-point Hadamard, one column per 16-bit lane.
  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    // First pass: finish the butterflies, then transpose the 8x8 block via
    // unpack operations so the second pass can transform the other dimension.
    b0 = _mm_add_epi16(a0, a4);
    b1 = _mm_add_epi16(a1, a5);
    b2 = _mm_add_epi16(a2, a6);
    b3 = _mm_add_epi16(a3, a7);
    b4 = _mm_sub_epi16(a0, a4);
    b5 = _mm_sub_epi16(a1, a5);
    b6 = _mm_sub_epi16(a2, a6);
    b7 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[7] = _mm_unpackhi_epi64(b0, b1);
    in[3] = _mm_unpacklo_epi64(b2, b3);
    in[4] = _mm_unpackhi_epi64(b2, b3);
    in[2] = _mm_unpacklo_epi64(b4, b5);
    in[6] = _mm_unpackhi_epi64(b4, b5);
    in[1] = _mm_unpacklo_epi64(b6, b7);
    in[5] = _mm_unpackhi_epi64(b6, b7);
  } else {
    // Second pass: write the final butterfly outputs in permuted order.
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];

  // Aligned loads: src_diff and every row (src_diff + k * src_stride) must
  // be 16-byte aligned.
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  // First pass transforms and transposes; second pass completes the 2D
  // transform.
  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_storeu_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[7]);
}
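As a sanity check, one can run the same residual block through both paths and compare the SATD each produces; coefficient ordering is deliberately not compared here, since only the sum of absolute values feeds the mode decision. The harness below is an assumption for illustration (it needs C11 for _Alignas and must be linked against both implementations); it is not part of this commit.

// Illustrative sketch only: cross-check the C and SSE2 Hadamard paths via
// the SATD they feed. _Alignas(16) satisfies the aligned loads used by the
// SSE2 version.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff);
int16_t vp9_satd_c(const int16_t *coeff, int length);

int main(void) {
  _Alignas(16) int16_t diff[64];
  int16_t coeff_c[64], coeff_sse2[64];
  int i;

  // Random residuals in the range a pixel difference can take.
  for (i = 0; i < 64; ++i)
    diff[i] = (int16_t)((rand() % 511) - 255);

  vp9_hadamard_8x8_c(diff, 8, coeff_c);
  vp9_hadamard_8x8_sse2(diff, 8, coeff_sse2);

  printf("satd(c) = %d, satd(sse2) = %d\n",
         vp9_satd_c(coeff_c, 64), vp9_satd_c(coeff_sse2, 64));
  return 0;
}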

int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  // Accumulate |coeff[i]| eight lanes at a time; abs() is computed as
  // (x ^ sign) - sign, where sign is the arithmetic-shift sign mask.
  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  // Horizontal reduction of the eight 16-bit partial sums.
  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}

void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
......