Commit 794a7bed authored by Scott LaVarnway

WIP: 8x8 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: Iacfd57324fbe2b7beca5d7f3dcae25c976e67f45
parent a272ff25
......@@ -16,6 +16,7 @@
// The codec is built as C, so its header and the hand-written prototype below
// must be given C linkage when included from this C++ test.
extern "C" {
#include "vp9_rtcd.h"
// C implementation of the combined 8x8 inverse DCT + reconstruction.
// The prototype matches the definition's signature: 16-bit coefficients in
// `input`, and the result is added into the 8-bit block at `dest`, whose rows
// are `dest_stride` bytes apart. int16_t (not `short`) is used so the
// declaration agrees with the int16_t buffers the tests pass in.
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride);
}
#include "acm_random.h"
......@@ -100,11 +101,15 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
int16_t test_output_block[64];
uint8_t dst[64], src[64];
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
......@@ -119,10 +124,10 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
test_temp_block[j] *= 4;
}
}
vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = test_input_block[j] - test_output_block[j];
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
......@@ -145,18 +150,22 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
int16_t test_output_block[64];
uint8_t dst[64], src[64];
// Initialize a test block with input range {-255, 255}.
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8() % 2 ? 255 : 0;
dst[j] = src[j] > 0 ? 0 : 255;
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = test_input_block[j] - test_output_block[j];
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
......
......@@ -112,20 +112,23 @@ TEST(VP9Idct8x8Test, AccuracyCheck) {
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input[64], coeff[64];
int16_t output_c[64];
double output_r[64];
uint8_t dst[64], src[64];
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
input[j] = rnd.Rand8() - rnd.Rand8();
input[j] = src[j] - dst[j];
const int pitch = 16;
reference_dct_2d(input, output_r);
for (int j = 0; j < 64; ++j)
coeff[j] = round(output_r[j]);
vp9_short_idct8x8_c(coeff, output_c, pitch);
vp9_short_idct8x8_add_c(coeff, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = output_c[j] -input[j];
const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 8x8 FDCT/IDCT has error " << error
......
......@@ -219,27 +219,27 @@ static void idct8_1d(int16_t *input, int16_t *output) {
output[7] = step1[0] - step1[7];
}
void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
// Rows
// First transform rows
for (i = 0; i < 8; ++i) {
idct8_1d(input, outptr);
input += 8;
outptr += 8;
}
// Columns
// Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * dest_stride + i]);
}
}
......@@ -400,8 +400,8 @@ static const transform_2d IHT_8[] = {
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
int pitch, int tx_type) {
void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
......@@ -421,14 +421,14 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
}
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * dest_stride + i]); }
}
void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
......@@ -447,7 +447,8 @@ void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * dest_stride + i]);
}
}
......
......@@ -88,9 +88,6 @@ if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_4x4 sse2
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_8x8 sse2
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
......@@ -188,11 +185,11 @@ specialize vp9_short_idct4x4_1
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct4x4 sse2
prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct8x8 sse2
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2
prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_8x8 sse2
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_8x8_add sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
......@@ -215,8 +212,8 @@ specialize vp9_short_idct1_32x32
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht8x8
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht8x8_add
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
......
......@@ -403,8 +403,18 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
in6 = _mm_subs_epi16(stp1_1, stp1_6); \
in7 = _mm_subs_epi16(stp1_0, stp2_7);
void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const int half_pitch = pitch >> 1;
// RECON_AND_STORE(dest, in_x): reconstruct one row of eight pixels.
//   - loads the low 64 bits (eight 8-bit values) at `dest`,
//   - widens them to 16-bit lanes by interleaving with `zero`,
//   - adds them to the eight 16-bit values in `in_x`,
//   - packs the sums back to 8 bits with unsigned saturation,
//   - stores the low 64 bits to `dest`, then advances `dest` by `stride`.
// Side effects: `in_x` is overwritten and `dest` is modified, so `dest` must
// be an assignable pointer. The macro also uses `zero` and `stride`, which are
// not parameters and must be in scope wherever it is expanded.
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
......@@ -461,19 +471,17 @@ void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
// Store results
_mm_store_si128((__m128i *)output, in0);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
RECON_AND_STORE(dest, in3);
RECON_AND_STORE(dest, in4);
RECON_AND_STORE(dest, in5);
RECON_AND_STORE(dest, in6);
RECON_AND_STORE(dest, in7);
}
void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const int half_pitch = pitch >> 1;
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
......@@ -612,15 +620,14 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
// Store results
_mm_store_si128((__m128i *)output, in0);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2);
RECON_AND_STORE(dest, in3);
RECON_AND_STORE(dest, in4);
RECON_AND_STORE(dest, in5);
RECON_AND_STORE(dest, in6);
RECON_AND_STORE(dest, in7);
}
#define IDCT16x16_1D \
......@@ -752,16 +759,6 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_10, stp2_13, stp2_11, stp2_12) \
}
// RECON_AND_STORE(dest, in_x): add one row of eight 16-bit values in `in_x`
// to the eight 8-bit values at `dest` and write the result back.
// The destination bytes are loaded (low 64 bits), zero-extended to 16-bit
// lanes via `zero`, summed with `in_x`, packed to 8 bits with unsigned
// saturation, and stored; `dest` is then advanced by `stride`.
// `in_x` is clobbered. `zero` and `stride` are taken from the scope in which
// the macro is expanded rather than passed as arguments.
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
......
......@@ -101,10 +101,6 @@ void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
add_residual(diff, dest, stride, 4, 4);
}
// 8x8 entry point for the generic residual adder: forwards `diff`, `dest`
// and `stride` unchanged to add_residual() with both block dimensions
// fixed at 8.
void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
  const int block_dim = 8;
  add_residual(diff, dest, stride, block_dim, block_dim);
}
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
......@@ -151,11 +147,8 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
vp9_idct_add_8x8(input, dest, stride, eob);
} else {
if (eob > 0) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
vp9_short_iht8x8(input, output, 8, tx_type);
vp9_short_iht8x8_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 128);
vp9_add_residual_8x8(output, dest, stride);
}
}
}
......@@ -210,8 +203,6 @@ void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
}
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
......@@ -233,20 +224,15 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
vp9_add_constant_residual_8x8(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
vp9_short_idct10_8x8(input, output, 16);
vp9_short_idct10_8x8_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[8] = input[9] = input[10] = 0;
input[16] = input[17] = 0;
input[24] = 0;
vp9_add_residual_8x8(output, dest, stride);
#endif
} else {
// the idct halves ( >> 1) the pitch
vp9_short_idct8x8(input, output, 8 << 1);
vp9_short_idct8x8_add(input, dest, stride);
vpx_memset(input, 0, 128);
vp9_add_residual_8x8(output, dest, stride);
}
}
}
......
......@@ -58,70 +58,6 @@ void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
*(int *)dest = _mm_cvtsi128_si32(p2);
}
// SSE2 version of the 8x8 residual add.
// Adds an 8x8 block of 16-bit residuals to an 8x8 block of 8-bit pixels and
// writes the sums, saturated to the unsigned 8-bit range, back over the
// pixels.
//   diff   - 64 residuals, eight per row, rows stored back to back. Read with
//            aligned 128-bit loads, so it must be 16-byte aligned.
//   dest   - top-left pixel of the block to update in place.
//   stride - distance in bytes between consecutive rows of dest.
// Every row of dest is read before any row is written.
void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
const int width = 8;
const __m128i zero = _mm_setzero_si128();
// Diff data: one aligned 128-bit load (eight int16 values) per row.
const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
// Prediction data: the low 64 bits (eight pixels) of each destination row.
__m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
__m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
__m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
__m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
__m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
__m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
__m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
__m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
// Zero-extend each pixel from 8 to 16 bits so it can be added to the
// 16-bit residuals.
p0 = _mm_unpacklo_epi8(p0, zero);
p1 = _mm_unpacklo_epi8(p1, zero);
p2 = _mm_unpacklo_epi8(p2, zero);
p3 = _mm_unpacklo_epi8(p3, zero);
p4 = _mm_unpacklo_epi8(p4, zero);
p5 = _mm_unpacklo_epi8(p5, zero);
p6 = _mm_unpacklo_epi8(p6, zero);
p7 = _mm_unpacklo_epi8(p7, zero);
// prediction + residual, lane by lane in 16-bit arithmetic.
p0 = _mm_add_epi16(p0, d0);
p1 = _mm_add_epi16(p1, d1);
p2 = _mm_add_epi16(p2, d2);
p3 = _mm_add_epi16(p3, d3);
p4 = _mm_add_epi16(p4, d4);
p5 = _mm_add_epi16(p5, d5);
p6 = _mm_add_epi16(p6, d6);
p7 = _mm_add_epi16(p7, d7);
// Pack two rows per register with unsigned saturation: the even row lands
// in the low 8 bytes and the odd row in the high 8 bytes.
p0 = _mm_packus_epi16(p0, p1);
p2 = _mm_packus_epi16(p2, p3);
p4 = _mm_packus_epi16(p4, p5);
p6 = _mm_packus_epi16(p6, p7);
// Write back: store the low 8 bytes (even row), then shift the register
// right by 8 bytes to bring the odd row down and store it as well.
_mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
p0 = _mm_srli_si128(p0, 8);
_mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
_mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
p2 = _mm_srli_si128(p2, 8);
_mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
_mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
p4 = _mm_srli_si128(p4, 8);
_mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
_mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
p6 = _mm_srli_si128(p6, 8);
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
......
......@@ -534,11 +534,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw * 2);
vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
block, 16), dst, xd->plane[plane].dst.stride);
} else {
vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
block, 16), dst, xd->plane[plane].dst.stride,
tx_type);
}
*wip_txfrm_size = 8;
break;
......@@ -589,7 +590,7 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
if (wip_txfrm_size < 32)
if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
}
......@@ -606,7 +607,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
if (wip_txfrm_size < 16)
if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
}
......@@ -628,13 +629,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
// wip version... will use foreach_transformed_block when done
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
if (wip_txfrm_size < 16)
if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
wip_txfrm_size = 0;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
if (wip_txfrm_size < 16)
if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment