Commit 043f4964 authored by Linfeng Zhang

Implement fdct4x8_new_sse2 and fadst4x8_new_sse2

Change-Id: I9ab260c5ca31fe7e06bfc0f806893463c5255c45
parent 1fffc1f4
@@ -24,6 +24,22 @@
extern "C" {
// Butterfly step on the low four 16-bit lanes of *in0 and *in1.
//
// The inputs are interleaved into (in0[i], in1[i]) pairs, and each pair is
// multiply-accumulated against the matching 16-bit pair in *w0 and in *w1,
// producing four 32-bit sums per weight vector. Every sum then has
// __rounding added, is arithmetically shifted right by cos_bit, and is
// narrowed back to 16 bits with signed saturation.
//
//   *out0: the four *w0 results, duplicated in both 64-bit halves.
//   *out1: the four *w1 results in the low 64 bits.
//
// NOTE(review): __rounding is presumably 1 << (cos_bit - 1) replicated in
// every 32-bit lane -- confirm at the call sites.
static INLINE void btf_16_w4_sse2(
const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
__m128i *const out0, __m128i *const out1) {
// Interleave the low four 16-bit lanes: in0[0], in1[0], in0[1], in1[1], ...
const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
// Per 32-bit lane: in0[i] * w[2i] + in1[i] * w[2i + 1].
const __m128i u0 = _mm_madd_epi16(t0, *w0);
const __m128i v0 = _mm_madd_epi16(t0, *w1);
// Add the rounding term before the shift.
const __m128i a0 = _mm_add_epi32(u0, __rounding);
const __m128i b0 = _mm_add_epi32(v0, __rounding);
// Arithmetic (sign-preserving) right shift by cos_bit.
const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
// Narrow to 16 bits with signed saturation; c0 fills both halves of *out0.
*out0 = _mm_packs_epi32(c0, c0);
// Low half holds the *w1 results (d0); the high half repeats c0.
// NOTE(review): presumably callers read only the low 64 bits -- confirm.
*out1 = _mm_packs_epi32(d0, c0);
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
{ \
__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
This diff is collapsed.
