Changes 16 point idct

This commit changes the inverse 16 point dct to use the same algorithm
as the one used for the 32 point idct. In fact, the 16 point idct now
uses exactly the same source code as the even portion of the 32 point
idct.

Tests showed that the current implementation has significantly better
accuracy than the previous version. With this implementation and the
minor bug fix to the forward 16 point dct, encoding tests showed about
0.2% better compression on the CIF set; test results on the std-hd
setting are still pending.

Change-Id: I68224b60c816ba03434e9f08bee147c7e344fb63
......@@ -398,6 +398,10 @@ specialize vp9_short_idct16x16
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct32x32
......@@ -300,14 +300,11 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
vp9_copy_mem16x16(pred, pitch, dest, stride);
} else if (eob == 1) {
/* DC only DCT coefficient. */
int16_t in = input[0] * dq[0];
int16_t out;
/* Note: the idct1 will need to be modified accordingly whenever
* vp9_short_idct16x16_c() is modified. */
out = (input[0] * dq[0] + 2) >> 2;
out = (out + 2) >> 2;
out = (out + 4) >> 3;
vp9_short_idct1_16x16_c(&in, &out);
input[0] = 0;
add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
