Commit a272ff25 authored by Scott LaVarnway

WIP: 16x16 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: Iea7976b22b1927d24b8004d2a3fddae7ecca3ba1
parent 2cf0d4be
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
extern "C" { extern "C" {
#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h" #include "vp9_rtcd.h"
void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
} }
#include "acm_random.h" #include "acm_random.h"
...@@ -269,19 +270,23 @@ TEST(VP9Idct16x16Test, AccuracyCheck) { ...@@ -269,19 +270,23 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
const int count_test_block = 1000; const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int16_t in[256], coeff[256]; int16_t in[256], coeff[256];
int16_t out_c[256]; uint8_t dst[256], src[256];
double out_r[256]; double out_r[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255]. // Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) for (int j = 0; j < 256; ++j)
in[j] = rnd.Rand8() - rnd.Rand8(); in[j] = src[j] - dst[j];
reference_16x16_dct_2d(in, out_r); reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++) for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]); coeff[j] = round(out_r[j]);
vp9_short_idct16x16_c(coeff, out_c, 32); vp9_short_idct16x16_add_c(coeff, dst, 16);
for (int j = 0; j < 256; ++j) { for (int j = 0; j < 256; ++j) {
const int diff = out_c[j] - in[j]; const int diff = dst[j] - src[j];
const int error = diff * diff; const int error = diff * diff;
EXPECT_GE(1, error) EXPECT_GE(1, error)
<< "Error: 16x16 IDCT has error " << error << "Error: 16x16 IDCT has error " << error
...@@ -289,7 +294,7 @@ TEST(VP9Idct16x16Test, AccuracyCheck) { ...@@ -289,7 +294,7 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
} }
} }
} }
#if 1
// we need enable fdct test once we re-do the 16 point fdct. // we need enable fdct test once we re-do the 16 point fdct.
TEST(VP9Fdct16x16Test, AccuracyCheck) { TEST(VP9Fdct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed()); ACMRandom rnd(ACMRandom::DeterministicSeed());
...@@ -299,18 +304,22 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) { ...@@ -299,18 +304,22 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) {
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[256]; int16_t test_input_block[256];
int16_t test_temp_block[256]; int16_t test_temp_block[256];
int16_t test_output_block[256]; uint8_t dst[256], src[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255]. // Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) for (int j = 0; j < 256; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8(); test_input_block[j] = src[j] - dst[j];
const int pitch = 32; const int pitch = 32;
vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch); vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch); vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
for (int j = 0; j < 256; ++j) { for (int j = 0; j < 256; ++j) {
const int diff = test_input_block[j] - test_output_block[j]; const int diff = dst[j] - src[j];
const int error = diff * diff; const int error = diff * diff;
if (max_error < error) if (max_error < error)
max_error = error; max_error = error;
...@@ -354,6 +363,4 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) { ...@@ -354,6 +363,4 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
} }
} }
} }
#endif
} // namespace } // namespace
...@@ -621,10 +621,9 @@ static void idct16_1d(int16_t *input, int16_t *output) { ...@@ -621,10 +621,9 @@ static void idct16_1d(int16_t *input, int16_t *output) {
output[15] = step2[0] - step2[15]; output[15] = step2[0] - step2[15];
} }
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[16 * 16]; int16_t out[16 * 16];
int16_t *outptr = out; int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j; int i, j;
int16_t temp_in[16], temp_out[16]; int16_t temp_in[16], temp_out[16];
...@@ -641,7 +640,8 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { ...@@ -641,7 +640,8 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
temp_in[j] = out[j * 16 + i]; temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out); idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]);
} }
} }
...@@ -823,8 +823,8 @@ static const transform_2d IHT_16[] = { ...@@ -823,8 +823,8 @@ static const transform_2d IHT_16[] = {
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3 { iadst16_1d, iadst16_1d } // ADST_ADST = 3
}; };
void vp9_short_iht16x16_c(int16_t *input, int16_t *output, void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
int pitch, int tx_type) { int tx_type) {
int i, j; int i, j;
int16_t out[16 * 16]; int16_t out[16 * 16];
int16_t *outptr = out; int16_t *outptr = out;
...@@ -844,37 +844,37 @@ void vp9_short_iht16x16_c(int16_t *input, int16_t *output, ...@@ -844,37 +844,37 @@ void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
temp_in[j] = out[j * 16 + i]; temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j)
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
} + dest[j * dest_stride + i]); }
} }
void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
int16_t out[16 * 16]; int dest_stride) {
int16_t *outptr = out; int16_t out[16 * 16];
const int half_pitch = pitch >> 1; int16_t *outptr = out;
int i, j; int i, j;
int16_t temp_in[16], temp_out[16]; int16_t temp_in[16], temp_out[16];
/* First transform rows. Since all non-zero dct coefficients are in /* First transform rows. Since all non-zero dct coefficients are in
* upper-left 4x4 area, we only need to calculate first 4 rows here. * upper-left 4x4 area, we only need to calculate first 4 rows here.
*/ */
vpx_memset(out, 0, sizeof(out)); vpx_memset(out, 0, sizeof(out));
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
idct16_1d(input, outptr); idct16_1d(input, outptr);
input += 16; input += 16;
outptr += 16; outptr += 16;
} }
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
}
}
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]);
}
}
void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64); int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
......
...@@ -91,9 +91,6 @@ specialize vp9_add_residual_4x4 sse2 ...@@ -91,9 +91,6 @@ specialize vp9_add_residual_4x4 sse2
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride" prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_8x8 sse2 specialize vp9_add_residual_8x8 sse2
prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_16x16 sse2
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2 specialize vp9_add_constant_residual_8x8 sse2
...@@ -200,11 +197,11 @@ specialize vp9_short_idct10_8x8 sse2 ...@@ -200,11 +197,11 @@ specialize vp9_short_idct10_8x8 sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8 specialize vp9_short_idct1_8x8
prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16 sse2 specialize vp9_short_idct16x16_add sse2
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_16x16 sse2 specialize vp9_short_idct10_16x16_add sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16 specialize vp9_short_idct1_16x16
...@@ -224,8 +221,8 @@ specialize vp9_short_iht8x8 ...@@ -224,8 +221,8 @@ specialize vp9_short_iht8x8
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4 specialize vp9_short_iht4x4
prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type" prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16 specialize vp9_short_iht16x16_add
prototype void vp9_idct4_1d "int16_t *input, int16_t *output" prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
specialize vp9_idct4_1d sse2 specialize vp9_idct4_1d sse2
......
...@@ -752,8 +752,17 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -752,8 +752,17 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_10, stp2_13, stp2_11, stp2_12) \ stp2_10, stp2_13, stp2_11, stp2_12) \
} }
void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { #define RECON_AND_STORE(dest, in_x) \
const int half_pitch = pitch >> 1; { \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
...@@ -938,31 +947,30 @@ void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -938,31 +947,30 @@ void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
in14 = _mm_srai_epi16(in14, 6); in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6); in15 = _mm_srai_epi16(in15, 6);
// Store results RECON_AND_STORE(dest, in0);
_mm_store_si128((__m128i *)output, in0); RECON_AND_STORE(dest, in1);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1); RECON_AND_STORE(dest, in2);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2); RECON_AND_STORE(dest, in3);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3); RECON_AND_STORE(dest, in4);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4); RECON_AND_STORE(dest, in5);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5); RECON_AND_STORE(dest, in6);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6); RECON_AND_STORE(dest, in7);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7); RECON_AND_STORE(dest, in8);
_mm_store_si128((__m128i *)(output + half_pitch * 8), in8); RECON_AND_STORE(dest, in9);
_mm_store_si128((__m128i *)(output + half_pitch * 9), in9); RECON_AND_STORE(dest, in10);
_mm_store_si128((__m128i *)(output + half_pitch * 10), in10); RECON_AND_STORE(dest, in11);
_mm_store_si128((__m128i *)(output + half_pitch * 11), in11); RECON_AND_STORE(dest, in12);
_mm_store_si128((__m128i *)(output + half_pitch * 12), in12); RECON_AND_STORE(dest, in13);
_mm_store_si128((__m128i *)(output + half_pitch * 13), in13); RECON_AND_STORE(dest, in14);
_mm_store_si128((__m128i *)(output + half_pitch * 14), in14); RECON_AND_STORE(dest, in15);
_mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
dest += 8 - (stride * 16);
output += 8;
} }
} }
} }
void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
const int half_pitch = pitch >> 1; int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
...@@ -1007,7 +1015,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -1007,7 +1015,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i; int i;
// 1-D idct. Load input data. // 1-D idct. Load input data.
in0 = _mm_load_si128((__m128i *)input); in0 = _mm_load_si128((__m128i *)input);
in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
...@@ -1298,24 +1305,24 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -1298,24 +1305,24 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
in14 = _mm_srai_epi16(in14, 6); in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6); in15 = _mm_srai_epi16(in15, 6);
// Store results RECON_AND_STORE(dest, in0);
_mm_store_si128((__m128i *)output, in0); RECON_AND_STORE(dest, in1);
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1); RECON_AND_STORE(dest, in2);
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2); RECON_AND_STORE(dest, in3);
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3); RECON_AND_STORE(dest, in4);
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4); RECON_AND_STORE(dest, in5);
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5); RECON_AND_STORE(dest, in6);
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6); RECON_AND_STORE(dest, in7);
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7); RECON_AND_STORE(dest, in8);
_mm_store_si128((__m128i *)(output + half_pitch * 8), in8); RECON_AND_STORE(dest, in9);
_mm_store_si128((__m128i *)(output + half_pitch * 9), in9); RECON_AND_STORE(dest, in10);
_mm_store_si128((__m128i *)(output + half_pitch * 10), in10); RECON_AND_STORE(dest, in11);
_mm_store_si128((__m128i *)(output + half_pitch * 11), in11); RECON_AND_STORE(dest, in12);
_mm_store_si128((__m128i *)(output + half_pitch * 12), in12); RECON_AND_STORE(dest, in13);
_mm_store_si128((__m128i *)(output + half_pitch * 13), in13); RECON_AND_STORE(dest, in14);
_mm_store_si128((__m128i *)(output + half_pitch * 14), in14); RECON_AND_STORE(dest, in15);
_mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
output += 8; dest += 8 - (stride * 16);
} }
} }
...@@ -1934,16 +1941,6 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { ...@@ -1934,16 +1941,6 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in30 = _mm_srai_epi16(in30, 6); in30 = _mm_srai_epi16(in30, 6);
in31 = _mm_srai_epi16(in31, 6); in31 = _mm_srai_epi16(in31, 6);
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
in_x = _mm_add_epi16(in_x, d0); \
in_x = _mm_packus_epi16(in_x, in_x); \
_mm_storel_epi64((__m128i *)(dest), in_x); \
dest += stride; \
}
RECON_AND_STORE(dest, in0); RECON_AND_STORE(dest, in0);
RECON_AND_STORE(dest, in1); RECON_AND_STORE(dest, in1);
RECON_AND_STORE(dest, in2); RECON_AND_STORE(dest, in2);
......
...@@ -105,10 +105,6 @@ void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) { ...@@ -105,10 +105,6 @@ void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
add_residual(diff, dest, stride, 8, 8); add_residual(diff, dest, stride, 8, 8);
} }
void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
add_residual(diff, dest, stride, 16, 16);
}
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) { int width, int height) {
int r, c; int r, c;
...@@ -260,19 +256,14 @@ void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, ...@@ -260,19 +256,14 @@ void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
if (tx_type == DCT_DCT) { if (tx_type == DCT_DCT) {
vp9_idct_add_16x16(input, dest, stride, eob); vp9_idct_add_16x16(input, dest, stride, eob);
} else { } else {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
if (eob > 0) { if (eob > 0) {
vp9_short_iht16x16(input, output, 16, tx_type); vp9_short_iht16x16_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 512); vpx_memset(input, 0, 512);
vp9_add_residual_16x16(output, dest, stride);
} }
} }
} }
void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
/* The calculation can be simplified if there are not many non-zero dct /* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */ * coefficients. Use eobs to separate different cases. */
if (eob) { if (eob) {
...@@ -288,21 +279,15 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { ...@@ -288,21 +279,15 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
vp9_add_constant_residual_16x16(out, dest, stride); vp9_add_constant_residual_16x16(out, dest, stride);
#if !CONFIG_SCATTERSCAN #if !CONFIG_SCATTERSCAN
} else if (eob <= 10) { } else if (eob <= 10) {
// the idct halves ( >> 1) the pitch vp9_short_idct10_16x16_add(input, dest, stride);
vp9_short_idct10_16x16(input, output, 32);
input[0] = input[1] = input[2] = input[3] = 0; input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0; input[16] = input[17] = input[18] = 0;
input[32] = input[33] = 0; input[32] = input[33] = 0;
input[48] = 0; input[48] = 0;
vp9_add_residual_16x16(output, dest, stride);
#endif #endif
} else { } else {
// the idct halves ( >> 1) the pitch vp9_short_idct16x16_add(input, dest, stride);
vp9_short_idct16x16(input, output, 16 << 1);
vpx_memset(input, 0, 512); vpx_memset(input, 0, 512);
vp9_add_residual_16x16(output, dest, stride);
} }
} }
} }
......
...@@ -122,65 +122,6 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) { ...@@ -122,65 +122,6 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
} }
void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
int stride) {
const int width = 16;
int i = 4;
const __m128i zero = _mm_setzero_si128();
// Diff data
__m128i d0, d1, d2, d3, d4, d5, d6, d7;
__m128i p0, p1, p2, p3, p4, p5, p6, p7;
do {
d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
// Prediction data.
p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
p0 = _mm_unpacklo_epi8(p1, zero);
p1 = _mm_unpackhi_epi8(p1, zero);
p2 = _mm_unpacklo_epi8(p3, zero);
p3 = _mm_unpackhi_epi8(p3, zero);
p4 = _mm_unpacklo_epi8(p5, zero);
p5 = _mm_unpackhi_epi8(p5, zero);
p6 = _mm_unpacklo_epi8(p7, zero);
p7 = _mm_unpackhi_epi8(p7, zero);
p0 = _mm_add_epi16(p0, d0);
p1 = _mm_add_epi16(p1, d1);
p2 = _mm_add_epi16(p2, d2);
p3 = _mm_add_epi16(p3, d3);
p4 = _mm_add_epi16(p4, d4);
p5 = _mm_add_epi16(p5, d5);
p6 = _mm_add_epi16(p6, d6);
p7 = _mm_add_epi16(p7, d7);
p0 = _mm_packus_epi16(p0, p1);
p1 = _mm_packus_epi16(p2, p3);
p2 = _mm_packus_epi16(p4, p5);
p3 = _mm_packus_epi16(p6, p7);
_mm_store_si128((__m128i *)(dest + 0 * stride), p0);
_mm_store_si128((__m128i *)(dest + 1 * stride), p1);
_mm_store_si128((__m128i *)(dest + 2 * stride), p2);
_mm_store_si128((__m128i *)(dest + 3 * stride), p3);
diff += 4 * width;
dest += 4 * stride;
} while (--i);
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) { int stride) {
uint8_t abs_diff; uint8_t abs_diff;
......
...@@ -522,11 +522,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, ...@@ -522,11 +522,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
case TX_16X16: case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) { if (tx_type == DCT_DCT) {
vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
diff, bw * 2); block, 16), dst, xd->plane[plane].dst.stride);
} else { } else {
vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
diff, bw, tx_type); block, 16), dst, xd->plane[plane].dst.stride,
tx_type