Commit d0dd01b8 authored by Yaowu Xu's avatar Yaowu Xu

Redo the forward 4x4 dct

The new fdct lowers the round trip sum squared error for a
4x4 block by ~0.12, or ~0.008/pixel. For reference, the old
matrix multiply version has an average round trip error of
1.46 for a 4x4 block.

Thanks to "derf" for his suggestions and references.

Change-Id: I5559d1e81d333b319404ab16b336b739f87afc79
parent a5906668
......@@ -100,14 +100,9 @@ typedef struct
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
void (*short_fdct4x4rd)(short *input, short *output, int pitch);
void (*short_fdct8x4rd)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*quantize_b)(BLOCK *b, BLOCKD *d);
} MACROBLOCK;
......
......@@ -11,163 +11,54 @@
#include <math.h>
/* 4x4 forward DCT basis tables, 16-bit fixed point.
 *
 * dct_matrix2 is laid out for the row (first) pass, dct_matrix1 for the
 * column (second) pass.  The magnitudes look like 2^15-scaled cosines
 * (23170 ~= 0.7071*2^15, 30274 ~= 0.9239*2^15, 12540 ~= 0.3827*2^15)
 * -- NOTE(review): derivation inferred from the values; confirm against
 * the DCT-II basis definition.
 */
static const short dct_matrix2[4][4] =
{
    { 23170,  30274,  23170,  12540 },
    { 23170,  12540, -23170, -30274 },
    { 23170, -12540, -23170,  30274 },
    { 23170, -30274,  23170, -12540 }
};
static const short dct_matrix1[4][4] =
{
    { 23170,  23170,  23170,  23170 },
    { 30274,  12540, -12540, -30274 },
    { 23170, -23170, -23170,  23170 },
    { 12540, -30274,  30274, -12540 }
};

/* Rounding shifts for the two matrix-multiply passes.  Renamed from
 * _1STSTAGESHIFT etc.: file-scope identifiers starting with an
 * underscore are reserved by the C standard (C11 7.1.3).  Values and
 * behavior are unchanged. */
#define DCT_STAGE1_SHIFT 14
#define DCT_STAGE1_ROUND (1 << (DCT_STAGE1_SHIFT - 1))
#define DCT_STAGE2_SHIFT 16
#define DCT_STAGE2_ROUND (1 << (DCT_STAGE2_SHIFT - 1))

/*
 * Forward 4x4 DCT, reference C implementation using two integer
 * matrix multiplies.
 *
 * input:  source samples; rows are `pitch` bytes apart (pitch is
 *         halved below to convert a byte stride into a short stride).
 * output: 16 transform coefficients, row-major.
 */
void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
{
    int i, j, k;
    short temp[4][4];
    int sumtemp;
    pitch >>= 1;   /* byte stride -> element (short) stride */

    /* First pass: input rows times basis, with a 14-bit rounding shift. */
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            sumtemp = 0;

            for (k = 0; k < 4; k++)
            {
                sumtemp += input[i * pitch + k] * dct_matrix2[k][j];
            }

            temp[i][j] = (short)((sumtemp + DCT_STAGE1_ROUND) >> DCT_STAGE1_SHIFT);
        }
    }

    /* Second pass: basis times intermediate block, 16-bit rounding shift. */
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            sumtemp = 0;

            for (k = 0; k < 4; k++)
            {
                sumtemp += dct_matrix1[i][k] * temp[k][j];
            }

            output[i * 4 + j] = (short)((sumtemp + DCT_STAGE2_ROUND) >> DCT_STAGE2_SHIFT);
        }
    }
}
/* 8x4 forward DCT: treat the block as two side-by-side 4x4 blocks and
 * transform each with the 4x4 routine.  The second block starts four
 * shorts into the source row and fills the second 16 output coeffs. */
void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
    short *right_half  = input + 4;
    short *second_blk  = output + 16;

    vp8_short_fdct4x4_c(input, output, pitch);
    vp8_short_fdct4x4_c(right_half, second_blk, pitch);
}
/* NOTE(review): the span below appears to be diff residue -- the old
 * x_c-based "fast" fdct and the new 2217/5352 version are interleaved
 * with the +/- markers stripped.  Evidence: op[0]/op[1]/op[2]/op[3] are
 * each assigned twice with different formulas per loop iteration, and
 * the brace after the first loop closes the function early, leaving an
 * unmatched '}' at the end.  Do not assume this compiles; recover the
 * two versions from the original commit before editing. */
/* NOTE(review): 60547 and 46341 exceed SHRT_MAX, so the conversion to
 * signed short is implementation-defined -- presumably relying on
 * two's-complement wraparound; verify on the target platform. */
static const signed short x_c1 = 60547;
static const signed short x_c2 = 46341;
static const signed short x_c3 = 25080;
/* Fast forward 4x4 DCT: butterfly row pass followed by column pass. */
void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
{
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
short *ip = input;
short *op = output;
int temp1, temp2;
/* Row (horizontal) pass. */
for (i = 0; i < 4; i++)
{
/* Old version: inputs scaled by 2. */
a1 = (ip[0] + ip[3]) * 2;
b1 = (ip[1] + ip[2]) * 2;
c1 = (ip[1] - ip[2]) * 2;
d1 = (ip[0] - ip[3]) * 2;
temp1 = a1 + b1;
temp2 = a1 - b1;
/* Old version outputs (overwritten below -- diff residue). */
op[0] = ((temp1 * x_c2) >> 16) + temp1;
op[2] = ((temp2 * x_c2) >> 16) + temp2;
temp1 = (c1 * x_c3) >> 16;
temp2 = ((d1 * x_c1) >> 16) + d1;
/* New version: butterflies recomputed with a <<3 scale. */
a1 = ((ip[0] + ip[3])<<3);
b1 = ((ip[1] + ip[2])<<3);
c1 = ((ip[1] - ip[2])<<3);
d1 = ((ip[0] - ip[3])<<3);
op[1] = temp1 + temp2;
temp1 = (d1 * x_c3) >> 16;
temp2 = ((c1 * x_c1) >> 16) + c1;
/* New version outputs (2217/5352 rounding constants). */
op[0] = a1 + b1;
op[2] = a1 - b1;
op[3] = temp1 - temp2;
op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
ip += pitch / 2;
op += 4;
}
}
/* Column (vertical) pass, in place over the row-pass output.
 * NOTE(review): the '}' above already closed the function in the old
 * version; everything below is the new version's continuation. */
ip = output;
op = output;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[12];
b1 = ip[4] + ip[8];
c1 = ip[4] - ip[8];
d1 = ip[0] - ip[12];
op[0] = ( a1 + b1 + 7)>>4;
op[8] = ( a1 - b1 + 7)>>4;
/* Old version's column butterflies and halving outputs follow
 * (overwritten by the assignments above/below -- diff residue). */
temp1 = a1 + b1;
temp2 = a1 - b1;
a2 = ((temp1 * x_c2) >> 16) + temp1;
c2 = ((temp2 * x_c2) >> 16) + temp2;
temp1 = (c1 * x_c3) >> 16;
temp2 = ((d1 * x_c1) >> 16) + d1;
b2 = temp1 + temp2;
temp1 = (d1 * x_c3) >> 16;
temp2 = ((c1 * x_c1) >> 16) + c1;
d2 = temp1 - temp2;
op[0] = (a2 + 1) >> 1;
op[4] = (b2 + 1) >> 1;
op[8] = (c2 + 1) >> 1;
op[12] = (d2 + 1) >> 1;
/* New version: (d1!=0) nudges op[4] so a nonzero d1 never rounds
 * to a zero coefficient -- presumably to preserve sign information;
 * confirm against the committed version. */
op[4] =((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
ip++;
op++;
}
}
/* NOTE(review): diff residue -- two signature lines share one body.
 * The commit renames vp8_fast_fdct8x4_c to vp8_short_fdct8x4_c; the
 * first signature and the vp8_fast_fdct4x4_c calls are the deleted
 * lines, the second signature and the vp8_short_fdct4x4_c calls are
 * the added ones.  Only one pair should survive. */
void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
vp8_fast_fdct4x4_c(input, output, pitch);
vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
vp8_short_fdct4x4_c(input, output, pitch);
vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
......
......@@ -32,16 +32,6 @@ extern prototype_fdct(vp8_fdct_short4x4);
#endif
extern prototype_fdct(vp8_fdct_short8x4);
#ifndef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c
#endif
extern prototype_fdct(vp8_fdct_fast4x4);
#ifndef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c
#endif
extern prototype_fdct(vp8_fdct_fast8x4);
#ifndef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
#endif
......
......@@ -66,7 +66,7 @@ void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BL
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
......
......@@ -130,7 +130,8 @@ void vp8_transform_mbuvrd(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 16);
}
}
......@@ -140,14 +141,16 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
......@@ -157,14 +160,16 @@ void vp8_transform_intra_mbyrd(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
void vp8_transform_mb(MACROBLOCK *x)
......@@ -173,7 +178,8 @@ void vp8_transform_mb(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
......@@ -182,12 +188,14 @@ void vp8_transform_mb(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
......@@ -197,14 +205,16 @@ void vp8_transform_mby(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
if (x->e_mbd.mbmi.mode != SPLITMV)
{
vp8_build_dcblock(x);
x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
}
......@@ -214,7 +224,8 @@ void vp8_transform_mbrd(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
......@@ -223,12 +234,14 @@ void vp8_transform_mbrd(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
&x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
x->short_walsh4x4(&x->block[24].src_diff[0],
&x->block[24].coeff[0], 8);
}
void vp8_stuff_inter16x16(MACROBLOCK *x)
......
......@@ -257,9 +257,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
z->short_fdct4x4rd = x->short_fdct4x4rd;
z->short_fdct8x4rd = x->short_fdct8x4rd;
z->short_fdct8x4rd = x->short_fdct8x4rd;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
......
......@@ -68,8 +68,8 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_c;
......
......@@ -137,8 +137,6 @@ extern unsigned int inter_b_modes[15];
extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
......@@ -1136,15 +1134,11 @@ void vp8_set_speed_features(VP8_COMP *cpi)
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
}
else
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
}
cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
......
......@@ -1028,7 +1028,7 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels
vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
// set to 0 no way to account for 2nd order DC so discount
//be->coeff[0] = 0;
......@@ -1056,7 +1056,7 @@ static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp
// Fdct and building the 2nd order block
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
{
mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
*Y2DCPtr++ = beptr->coeff[0];
*Y2DCPtr++ = beptr->coeff[16];
}
......
......@@ -181,10 +181,17 @@ void vp8_cmachine_specific_config(void)
// Willamette instruction set available:
vp8_mbuverror = vp8_mbuverror_xmm;
vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
#if 0 //new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt;
#else
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
......@@ -218,10 +225,17 @@ void vp8_cmachine_specific_config(void)
// MMX instruction set available:
vp8_mbuverror = vp8_mbuverror_mmx;
vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
#if 0 // new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx;
#else
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
......@@ -254,10 +268,10 @@ void vp8_cmachine_specific_config(void)
{
// Pure C:
vp8_mbuverror = vp8_mbuverror_c;
vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
vp8_subtract_b = vp8_subtract_b_c;
vp8_subtract_mbuv = vp8_subtract_mbuv_c;
......
......@@ -13,8 +13,7 @@
section .text
global sym(vp8_short_fdct4x4_mmx)
global sym(vp8_fast_fdct4x4_mmx)
global sym(vp8_fast_fdct8x4_wmt)
global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
......@@ -24,339 +23,8 @@ section .text
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
%define _1STSTAGESHIFT 14
%define _2NDSTAGESHIFT 16
; using matrix multiply with source and destbuffer has a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
movsxd rax, dword ptr arg(2) ;pitch
lea rdx, [dct_matrix GLOBAL]
movq mm0, [rsi ]
movq mm1, [rsi + rax]
movq mm2, [rsi + rax*2]
lea rsi, [rsi + rax*2]
movq mm3, [rsi + rax]
; first column
movq mm4, mm0
movq mm7, [rdx]
pmaddwd mm4, mm7
movq mm5, mm1
pmaddwd mm5, mm7
movq mm6, mm4
punpckldq mm4, mm5
punpckhdq mm6, mm5
paddd mm4, mm6
movq mm5, mm2
pmaddwd mm5, mm7
movq mm6, mm3
pmaddwd mm6, mm7
movq mm7, mm5
punpckldq mm5, mm6
punpckhdq mm7, mm6
paddd mm5, mm7
movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
paddd mm4, mm6
paddd mm5, mm6
psrad mm4, _1STSTAGESHIFT
psrad mm5, _1STSTAGESHIFT
packssdw mm4, mm5
movq [rdi], mm4
;second column
movq mm4, mm0
pmaddwd mm4, [rdx+8]
movq mm5, mm1
pmaddwd mm5, [rdx+8]
movq mm6, mm4
punpckldq mm4, mm5
punpckhdq mm6, mm5
paddd mm4, mm6
movq mm5, mm2
pmaddwd mm5, [rdx+8]
movq mm6, mm3
pmaddwd mm6, [rdx+8]
movq mm7, mm5
punpckldq mm5, mm6
punpckhdq mm7, mm6
paddd mm5, mm7
movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
paddd mm4, mm6
paddd mm5, mm6
psrad mm4, _1STSTAGESHIFT
psrad mm5, _1STSTAGESHIFT
packssdw mm4, mm5
movq [rdi+8], mm4
;third column
movq mm4, mm0
pmaddwd mm4, [rdx+16]
movq mm5, mm1
pmaddwd mm5, [rdx+16]
movq mm6, mm4
punpckldq mm4, mm5
punpckhdq mm6, mm5
paddd mm4, mm6
movq mm5, mm2
pmaddwd mm5, [rdx+16]
movq mm6, mm3
pmaddwd mm6, [rdx+16]
movq mm7, mm5
punpckldq mm5, mm6
punpckhdq mm7, mm6
paddd mm5, mm7
movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
paddd mm4, mm6
paddd mm5, mm6
psrad mm4, _1STSTAGESHIFT
psrad mm5, _1STSTAGESHIFT
packssdw mm4, mm5
movq [rdi+16], mm4
;fourth column (this is the last column, so we do not have save the source any more)
pmaddwd mm0, [rdx+24]
pmaddwd mm1, [rdx+24]
movq mm6, mm0
punpckldq mm0, mm1
punpckhdq mm6, mm1
paddd mm0, mm6
pmaddwd mm2, [rdx+24]
pmaddwd mm3, [rdx+24]
movq mm7, mm2