Commit a272ff25 authored by Scott LaVarnway

WIP: 16x16 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: Iea7976b22b1927d24b8004d2a3fddae7ecca3ba1
parent 2cf0d4be
......@@ -17,6 +17,7 @@
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h"
void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
......@@ -269,19 +270,23 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[256], coeff[256];
int16_t out_c[256];
uint8_t dst[256], src[256];
double out_r[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
in[j] = rnd.Rand8() - rnd.Rand8();
in[j] = src[j] - dst[j];
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]);
vp9_short_idct16x16_c(coeff, out_c, 32);
vp9_short_idct16x16_add_c(coeff, dst, 16);
for (int j = 0; j < 256; ++j) {
const int diff = out_c[j] - in[j];
const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 16x16 IDCT has error " << error
......@@ -289,7 +294,7 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
}
}
}
#if 1
// We need to enable the fdct test once we re-do the 16-point fdct.
TEST(VP9Fdct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
......@@ -299,18 +304,22 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) {
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[256];
int16_t test_temp_block[256];
int16_t test_output_block[256];
uint8_t dst[256], src[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
const int pitch = 32;
vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
for (int j = 0; j < 256; ++j) {
const int diff = test_input_block[j] - test_output_block[j];
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
......@@ -354,6 +363,4 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
}
}
}
#endif
} // namespace
......@@ -621,10 +621,9 @@ static void idct16_1d(int16_t *input, int16_t *output) {
output[15] = step2[0] - step2[15];
}
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[16], temp_out[16];
......@@ -641,7 +640,8 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]);
}
}
......@@ -823,8 +823,8 @@ static const transform_2d IHT_16[] = {
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
int pitch, int tx_type) {
void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
......@@ -844,14 +844,14 @@ void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
}
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]); }
}
void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[16], temp_out[16];
......@@ -871,11 +871,11 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]);
}
}
void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
......
......@@ -91,9 +91,6 @@ specialize vp9_add_residual_4x4 sse2
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_8x8 sse2
prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_16x16 sse2
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
......@@ -200,11 +197,11 @@ specialize vp9_short_idct10_8x8 sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct16x16 sse2
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16 sse2
prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_16x16_add sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
......@@ -224,8 +221,8 @@ specialize vp9_short_iht8x8
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16
# 16x16 inverse hybrid transform that adds its result directly into dest
# (clipped to pixel range). Parameter names follow the C definition and the
# other 16x16 *_add prototypes: dest / dest_stride rather than output / pitch.
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht16x16_add
prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
specialize vp9_idct4_1d sse2
......
......@@ -752,8 +752,17 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_10, stp2_13, stp2_11, stp2_12) \
}
void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
const int half_pitch = pitch >> 1;
// Reconstructs one 8-pixel row: adds the eight 16-bit values in in_x to the
// 8 bytes currently stored at dest and writes the sum back to dest.
//   - The 8 bytes at dest are loaded and widened to 16 bits by interleaving
//     them with `zero` (expected to be an all-zero vector at the expansion
//     site).
//   - The sum is packed back to bytes with unsigned saturation, so results
//     are clamped to [0, 255].
//   - in_x is overwritten with the packed result.
//   - dest is advanced by `stride` afterwards.
// `zero` and `stride` are not macro arguments; both must be in scope wherever
// the macro is expanded.
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
......@@ -938,31 +947,30 @@ void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
// Store results
_mm_store_si128((__m128i *)output, in0);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
_mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
_mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
_mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
_mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
_mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
_mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
_mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
_mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
output += 8;
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
RECON_AND_STORE(dest, in3);
RECON_AND_STORE(dest, in4);
RECON_AND_STORE(dest, in5);
RECON_AND_STORE(dest, in6);
RECON_AND_STORE(dest, in7);
RECON_AND_STORE(dest, in8);
RECON_AND_STORE(dest, in9);
RECON_AND_STORE(dest, in10);
RECON_AND_STORE(dest, in11);
RECON_AND_STORE(dest, in12);
RECON_AND_STORE(dest, in13);
RECON_AND_STORE(dest, in14);
RECON_AND_STORE(dest, in15);
dest += 8 - (stride * 16);
}
}
}
void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
const int half_pitch = pitch >> 1;
void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
......@@ -1007,7 +1015,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// 1-D idct. Load input data.
in0 = _mm_load_si128((__m128i *)input);
in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
......@@ -1298,24 +1305,24 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
// Store results
_mm_store_si128((__m128i *)output, in0);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
_mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
_mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
_mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
_mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
_mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
_mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
_mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
_mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
output += 8;
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
RECON_AND_STORE(dest, in3);
RECON_AND_STORE(dest, in4);
RECON_AND_STORE(dest, in5);
RECON_AND_STORE(dest, in6);
RECON_AND_STORE(dest, in7);
RECON_AND_STORE(dest, in8);
RECON_AND_STORE(dest, in9);
RECON_AND_STORE(dest, in10);
RECON_AND_STORE(dest, in11);
RECON_AND_STORE(dest, in12);
RECON_AND_STORE(dest, in13);
RECON_AND_STORE(dest, in14);
RECON_AND_STORE(dest, in15);
dest += 8 - (stride * 16);
}
}
......@@ -1934,16 +1941,6 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in30 = _mm_srai_epi16(in30, 6);
in31 = _mm_srai_epi16(in31, 6);
// Adds a row of eight 16-bit residuals (in_x) onto the eight bytes at dest.
// Steps: load 8 bytes from dest, interleave them with `zero` to get 16-bit
// lanes, add in_x, pack to bytes with unsigned saturation (clamps to 0..255),
// store the low 8 bytes back to dest, then step dest forward by `stride`.
// Side effects: in_x is replaced by the packed result and dest is modified.
// Depends on `zero` and `stride` being defined in the enclosing scope.
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
......
......@@ -105,10 +105,6 @@ void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
add_residual(diff, dest, stride, 8, 8);
}
/* Adds a 16x16 block of residuals (diff) onto the pixels at dest by
 * forwarding to add_residual() with a fixed 16x16 block size. */
void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
  const int block_dim = 16;

  add_residual(diff, dest, stride, block_dim, block_dim);
}
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
......@@ -260,19 +256,14 @@ void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
if (tx_type == DCT_DCT) {
vp9_idct_add_16x16(input, dest, stride, eob);
} else {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
if (eob > 0) {
vp9_short_iht16x16(input, output, 16, tx_type);
vp9_short_iht16x16_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 512);
vp9_add_residual_16x16(output, dest, stride);
}
}
}
void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob) {
......@@ -288,21 +279,15 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
vp9_add_constant_residual_16x16(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
// the idct halves ( >> 1) the pitch
vp9_short_idct10_16x16(input, output, 32);
vp9_short_idct10_16x16_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;
input[32] = input[33] = 0;
input[48] = 0;
vp9_add_residual_16x16(output, dest, stride);
#endif
} else {
// the idct halves ( >> 1) the pitch
vp9_short_idct16x16(input, output, 16 << 1);
vp9_short_idct16x16_add(input, dest, stride);
vpx_memset(input, 0, 512);
vp9_add_residual_16x16(output, dest, stride);
}
}
}
......
......@@ -122,65 +122,6 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
/* Adds a 16x16 block of 16-bit residuals to a 16x16 block of 8-bit pixels.
 *
 * diff   - residuals, 16 values per row, rows stored back to back; read with
 *          aligned 128-bit loads.
 * dest   - pixels to update in place; each row is read and written with
 *          aligned 128-bit accesses.
 * stride - distance in bytes between consecutive rows of dest.
 *
 * Each sum is packed with unsigned saturation, i.e. clamped to [0, 255].
 */
void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
                                 int stride) {
  const __m128i zero = _mm_setzero_si128();
  int row;

  for (row = 0; row < 16; ++row) {
    /* Residuals for the left and right 8 pixels of this row. */
    const __m128i res_lo = _mm_load_si128((const __m128i *)diff);
    const __m128i res_hi = _mm_load_si128((const __m128i *)(diff + 8));

    /* Prediction row, widened from 8 to 16 bits per pixel. */
    const __m128i pred = _mm_load_si128((const __m128i *)dest);
    const __m128i sum_lo = _mm_add_epi16(_mm_unpacklo_epi8(pred, zero), res_lo);
    const __m128i sum_hi = _mm_add_epi16(_mm_unpackhi_epi8(pred, zero), res_hi);

    /* Narrow back to bytes with saturation and write the row out. */
    _mm_store_si128((__m128i *)dest, _mm_packus_epi16(sum_lo, sum_hi));

    diff += 16;
    dest += stride;
  }
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
......
......@@ -522,11 +522,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw * 2);
vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
block, 16), dst, xd->plane[plane].dst.stride);
} else {
vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
block, 16), dst, xd->plane[plane].dst.stride,
tx_type);
}
*wip_txfrm_size = 16;
break;
......@@ -605,7 +606,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
if (wip_txfrm_size < 32)
if (wip_txfrm_size < 16)
vp9_recon_sbuv(xd, bsize);
}
......@@ -627,13 +628,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
// wip version... will use foreach_transformed_block when done
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
if (wip_txfrm_size < 32)
if (wip_txfrm_size < 16)
vp9_recon_sby(xd, bsize);
wip_txfrm_size = 0;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
if (wip_txfrm_size < 32)
if (wip_txfrm_size < 16)
vp9_recon_sbuv(xd, bsize);
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment