Commit 5b6d33f9 authored by Christian Duvivier's avatar Christian Duvivier
Browse files

Faster vp9_short_fdct4x4 and vp9_short_fdct8x4.

Scalar path is about 1.3x faster (2.1% overall encoder speedup).
SSE2 path is about 5.0x faster (8.4% overall encoder speedup).

Change-Id: I360d167b5ad6f387bba00406129323e2fe6e7dda
parent 7f7d1357
......@@ -645,10 +645,10 @@ prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int p
specialize vp9_short_fdct8x8 sse2
prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct4x4
specialize vp9_short_fdct4x4 sse2
prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct8x4
specialize vp9_short_fdct8x4 sse2
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32
......
......@@ -37,30 +37,68 @@ static void fdct4_1d(int16_t *input, int16_t *output) {
}
void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
const int short_pitch = pitch >> 1;
int i, j;
int16_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = input[j * short_pitch + i] << 4;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
fdct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
outptr[j * 4 + i] = temp_out[j];
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[4 * 4];
int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
/*canbe16*/ int input[4];
/*canbe16*/ int step[4];
/*needs32*/ int temp1, temp2;
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
input[0] = in[0 * stride] << 4;
input[1] = in[1 * stride] << 4;
input[2] = in[2 * stride] << 4;
input[3] = in[3 * stride] << 4;
if (i == 0 && input[0]) {
input[0] += 1;
}
} else {
input[0] = in[0 * 4];
input[1] = in[1 * 4];
input[2] = in[2 * 4];
input[3] = in[3 * 4];
}
// Transform.
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
step[2] = input[1] - input[2];
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
out[0] = dct_const_round_shift(temp1);
out[2] = dct_const_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
out[1] = dct_const_round_shift(temp1);
out[3] = dct_const_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in++;
out += 4;
}
// Setup in/out for next pass.
in = intermediate;
out = output;
}
// Rows
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j + i * 4];
fdct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
output[j + i * 4] = (temp_out[j] + 1) >> 2;
{
int i, j;
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
}
}
}
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
; Common prologue. After it runs, the aliases `input`, `output` and
; `pitch` name the registers that hold the three arguments, and `pitch`
; is valid as a full-width index register.
;   32-bit ABI : builds an rbp frame, saves rsi/rdi and loads the three
;                arguments through arg() into rsi, rdi and rax.
;   Win64      : arguments stay in rcx, rdx, r8. SAVE_XMM is invoked
;                because xmm6 is used in this file and is callee-saved
;                under that ABI.
;   otherwise  : arguments stay in rdi, rsi, rdx (System V order).
; arg(), GET_GOT and SAVE_XMM are expected to come from the %include
; above. Everything pushed or saved here is released again by
; STACK_FRAME_DESTROY, which must end every function using this macro.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define input rsi
  %define output rdi
  %define pitch rax
  push rbp
  mov rbp, rsp
  GET_GOT rbx
  push rsi
  push rdi
  ; end prolog
  mov rsi, arg(0)
  mov rdi, arg(1)
  movsxd rax, dword ptr arg(2)        ; pitch, widened for use as an index
%elif LIBVPX_YASM_WIN64
  %define input rcx
  %define output rdx
  %define pitch r8
  SAVE_XMM 7, u
  ; pitch is a 32-bit int: the upper half of r8 is not defined by the
  ; ABI, so widen it before it is used in [input + pitch].
  movsxd r8, r8d
%else
  %define input rdi
  %define output rsi
  %define pitch rdx
  ; pitch is a 32-bit int: the upper half of rdx is not defined by the
  ; ABI, so widen it before it is used in [input + pitch].
  movsxd rdx, edx
%endif
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
; Epilogue matching STACK_FRAME_CREATE: releases whatever the prologue
; saved for the active ABI, blanks the argument aliases and returns.
;   32-bit ABI : pops rdi, rsi, the GOT state and rbp (reverse of the
;                prologue's pushes).
;   Win64      : RESTORE_XMM pairs with the prologue's SAVE_XMM.
;   otherwise  : nothing was saved, so only the ret is emitted.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
%if ABI_IS_32BIT
  pop rdi
  pop rsi
  RESTORE_GOT
  pop rbp
%elif LIBVPX_YASM_WIN64
  RESTORE_XMM
%endif
  ; The aliases are redefined as empty so a stale mapping cannot be
  ; picked up before the next STACK_FRAME_CREATE.
  %define input
  %define output
  %define pitch
  ret
%endmacro
;-----------------------------------------------------------------------
;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit samples.
;   input  - first sample of the block; row n is read at input + n*pitch,
;            so pitch is a stride in bytes
;   output - receives the 16 coefficients through two movdqa stores, so
;            it has to be 16-byte aligned
; Uses xmm0-xmm5.
; Lane comments list the 16-bit words from the most significant lane
; (left) to the least significant lane (right). While loading, a tag
; "rc" is the sample at row r, column c.
;-----------------------------------------------------------------------
global sym(vp9_short_fdct4x4_sse2) PRIVATE
sym(vp9_short_fdct4x4_sse2):
STACK_FRAME_CREATE
; load the four rows, 4 words each
movq xmm0, MMWORD PTR[input ] ;03 02 01 00
movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
lea input, [input+2*pitch]
movq xmm1, MMWORD PTR[input ] ;23 22 21 20
movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
; regroup so that, for every row, xmm0 holds columns 1,0 and xmm1 holds
; columns 2,3 in matching lanes
punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
movdqa xmm2, xmm0
punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
movdqa xmm1, xmm0
punpckldq xmm0, xmm2 ;31 30 21 20 11 10 01 00
pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
; first pass, per row:
;   a1 = col0 + col3, b1 = col1 + col2, c1 = col1 - col2, d1 = col0 - col3
movdqa xmm3, xmm0
paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
; pmaddwd against (1, 1) / (1, -1) word pairs yields a1 + b1 / a1 - b1
; as 32-bit values
movdqa xmm1, xmm0
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
movdqa xmm4, xmm3
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw xmm0, xmm1 ;op[2] op[0]
packssdw xmm3, xmm4 ;op[3] op[1]
; layout of xmm0 and xmm3 after the packs:
; 23 22 21 20 03 02 01 00
;
; 33 32 31 30 13 12 11 10
;
; reorder the first-pass results for the second pass
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
movdqa xmm3, xmm0
punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
movdqa xmm2, xmm0
punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
; swap the two 64-bit halves of xmm2 so that group 0 pairs with group 3
; and group 1 with group 2 in the add/sub below
pshufd xmm2, xmm2, 04eh
; second pass
movdqa xmm3, xmm0
paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
movdqa xmm2, xmm3 ;save d1 for compare
pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
movdqa xmm1, xmm0
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
pxor xmm4, xmm4 ;zero out for compare
paddd xmm0, xmm5
paddd xmm1, xmm5
pcmpeqw xmm2, xmm4 ;0xffff in every lane that is zero
psrad xmm0, 4 ;(a1 + b1 + 7)>>4
psrad xmm1, 4 ;(a1 - b1 + 7)>>4
; pandn inverts the compare result before masking, so xmm2 ends up as 1
; in each of the four low lanes where d1 != 0 and 0 everywhere else
pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
;and keep bit 0 of lower
movdqa xmm4, xmm3
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
packssdw xmm0, xmm1 ;op[8] op[0]
psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
packssdw xmm3, xmm4 ;op[12] op[4]
movdqa xmm1, xmm0
paddw xmm3, xmm2 ;op[4] += (d1!=0)
punpcklqdq xmm0, xmm3 ;op[4] op[0]
punpckhqdq xmm1, xmm3 ;op[12] op[8]
; aligned stores: output must be 16-byte aligned
movdqa XMMWORD PTR[output + 0], xmm0
movdqa XMMWORD PTR[output + 16], xmm1
STACK_FRAME_DESTROY
;-----------------------------------------------------------------------
;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Transforms two horizontally adjacent 4x4 blocks in one call, using the
; same constants, shifts and rounding as vp9_short_fdct4x4_sse2.
;   input  - four rows of 8 words, row n read with movdqa at
;            input + n*pitch; pitch is a stride in bytes and every row
;            address has to be 16-byte aligned
;   output - 32 words written with four movdqa stores (16-byte aligned):
;            the 16 coefficients of the left block at byte offsets 0-31,
;            then the 16 coefficients of the right block at 32-63
; Uses xmm0-xmm6.
; Lane comments list the 16-bit words from the least significant lane
; (left) to the most significant lane (right).
;-----------------------------------------------------------------------
global sym(vp9_short_fdct8x4_sse2) PRIVATE
sym(vp9_short_fdct8x4_sse2):
STACK_FRAME_CREATE
; read the input data
movdqa xmm0, [input ]
movdqa xmm2, [input+ pitch]
lea input, [input+2*pitch]
movdqa xmm4, [input ]
movdqa xmm3, [input+ pitch]
; transpose for the first stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
; each register now holds one column of the left block in its low half
; and the matching column of the right block in its high half:
; xmm0 0
; xmm1 1
; xmm2 2
; xmm3 3
; first stage
movdqa xmm5, xmm0
movdqa xmm4, xmm1
paddw xmm0, xmm3 ; a1 = 0 + 3
paddw xmm1, xmm2 ; b1 = 1 + 2
psubw xmm4, xmm2 ; c1 = 1 - 2
psubw xmm5, xmm3 ; d1 = 0 - 3
psllw xmm5, 3
psllw xmm4, 3
psllw xmm0, 3
psllw xmm1, 3
; output 0 and 2
movdqa xmm2, xmm0 ; a1
paddw xmm0, xmm1 ; op[0] = a1 + b1
psubw xmm2, xmm1 ; op[2] = a1 - b1
; output 1 and 3
; interleave c1, d1
movdqa xmm1, xmm5 ; d1
punpcklwd xmm1, xmm4 ; c1 d1
punpckhwd xmm5, xmm4 ; c1 d1
movdqa xmm3, xmm1
movdqa xmm4, xmm5
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw xmm1, xmm4 ; op[1]
packssdw xmm3, xmm5 ; op[3]
; done with the first stage
; transpose for the second stage
movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
; operands of the second stage (xmm2 is free and becomes a copy of
; operand 2 below):
; xmm0 0
; xmm4 1
; xmm1 2
; xmm3 3
movdqa xmm5, xmm0
movdqa xmm2, xmm1
paddw xmm0, xmm3 ; a1 = 0 + 3
paddw xmm1, xmm4 ; b1 = 1 + 2
psubw xmm4, xmm2 ; c1 = 1 - 2
psubw xmm5, xmm3 ; d1 = 0 - 3
pxor xmm6, xmm6 ; zero out for compare
pcmpeqw xmm6, xmm5 ; 0xffff in every lane where d1 == 0
pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; invert, then keep bit 0:
; 1 in every lane where d1 != 0
; output 0 and 2
movdqa xmm2, xmm0 ; a1
paddw xmm0, xmm1 ; a1 + b1
psubw xmm2, xmm1 ; a1 - b1
paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
; output 1 and 3
; interleave c1, d1
movdqa xmm1, xmm5 ; d1
punpcklwd xmm1, xmm4 ; c1 d1
punpckhwd xmm5, xmm4 ; c1 d1
movdqa xmm3, xmm1
movdqa xmm4, xmm5
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
packssdw xmm1, xmm4 ; op[4]
packssdw xmm3, xmm5 ; op[12]
paddw xmm1, xmm6 ; op[4] += (d1!=0)
; split the low (left block) and high (right block) halves so that each
; block's 16 coefficients are stored contiguously
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
movdqa XMMWORD PTR[output + 0 ], xmm0
movdqa XMMWORD PTR[output + 16], xmm2
movdqa XMMWORD PTR[output + 32], xmm4
movdqa XMMWORD PTR[output + 48], xmm5
STACK_FRAME_DESTROY
SECTION_RODATA
; Constant tables for the two transforms above. Each table is 16 bytes
; and 16-byte aligned.

; Word pairs multiplied by interleaved (d1, c1) lanes with pmaddwd.
align 16
_5352_2217:
  times 4 dw 5352, 2217               ; d1*5352 + c1*2217
align 16
_2217_neg5352:
  times 4 dw 2217, -5352              ; d1*2217 - c1*5352

; pmaddwd against all ones adds each word pair (a1 + b1).
align 16
_mult_add:
  times 8 dw 1

; Masks applied with pandn to the "d1 == 0" compare result: bit 0 is
; kept in the four low lanes (4x4 routine) or in all eight (8x4 routine).
align 16
_cmp_mask:
  times 4 dw 1
  times 4 dw 0
align 16
_cmp_mask8x4:
  times 8 dw 1

; pmaddwd against (1, -1) subtracts each word pair (a1 - b1).
align 16
_mult_sub:
  times 4 dw 1, -1

; Rounding term added before the >>4 shift, as dwords and as words.
align 16
_7:
  times 4 dd 7
align 16
_7w:
  times 8 dw 7

; Rounding terms added before the >>12 shifts.
align 16
_14500:
  times 4 dd 14500
align 16
_7500:
  times 4 dd 7500

; Rounding terms added before the >>16 shifts.
align 16
_12000:
  times 4 dd 12000
align 16
_51000:
  times 4 dd 51000
......@@ -11,6 +11,111 @@
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
const int stride = pitch >> 1;
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
const __m128i kOne = _mm_set1_epi16(1);
__m128i in0, in1, in2, in3;
// Load inputs.
{
in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
// x = x << 4
in0 = _mm_slli_epi16(in0, 4);
in1 = _mm_slli_epi16(in1, 4);
in2 = _mm_slli_epi16(in2, 4);
in3 = _mm_slli_epi16(in3, 4);
// if (i == 0 && input[0]) input[0] += 1;
{
// The mask will only contain whether the first value is zero, all
// other comparisons will fail as something shifted by 4 (above << 4)
// can never be equal to one. To increment in the non-zero case, we
// add the mask and one for the first element:
// - if zero, mask = -1, v = v - 1 + 1 = v
// - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
__m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
in0 = _mm_add_epi16(in0, mask);
in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
}
}
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
// Transform 1/2: Add/subtract
const __m128i r0 = _mm_add_epi16(in0, in3);
const __m128i r1 = _mm_add_epi16(in1, in2);
const __m128i r2 = _mm_sub_epi16(in1, in2);
const __m128i r3 = _mm_sub_epi16(in0, in3);
// Transform 1/2: Interleave to do the multiply by constants which gets us
// into 32 bits.
const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);