Commit c2368362 authored by Frank Bossen

Speed up SSE4 implementation of 64-point inverse transform

Avoid unnecessary computations by exploiting the fact that only the
lower-frequency 32x32 quadrant has nonzero values.

Runs about 2x faster.

Change-Id: Ie86f56ccdce917e30b594253f10e121b4dcb0abc
parent ae6e6bc1
......@@ -90,4 +90,14 @@ static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
return x;
static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
const __m128i *rounding, int bit) {
__m128i x;
x = _mm_mullo_epi32(*w0, *n0);
x = _mm_add_epi32(x, *rounding);
x = _mm_srai_epi32(x, bit);
return x;
