Commit 1db63732 authored by Scott LaVarnway, committed by Gerrit Code Review
Browse files

Merge "WIP: 4x4 idct/recon merge" into experimental

parents a2b9b744 ba48a111
...@@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { ...@@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[16]; int16_t test_input_block[16];
int16_t test_temp_block[16]; int16_t test_temp_block[16];
int16_t test_output_block[16]; uint8_t dst[16], src[16];
for (int j = 0; j < 16; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255]. // Initialize a test block with input range [-255, 255].
for (int j = 0; j < 16; ++j) for (int j = 0; j < 16; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8(); test_input_block[j] = src[j] - dst[j];
// TODO(Yaowu): this should be converted to a parameterized test // TODO(Yaowu): this should be converted to a parameterized test
// to test optimized versions of this function. // to test optimized versions of this function.
...@@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { ...@@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
} }
// Because the bitstream is not frozen yet, use the idct in the codebase. // Because the bitstream is not frozen yet, use the idct in the codebase.
vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch); vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
for (int j = 0; j < 16; ++j) { for (int j = 0; j < 16; ++j) {
const int diff = test_input_block[j] - test_output_block[j]; const int diff = dst[j] - src[j];
const int error = diff * diff; const int error = diff * diff;
if (max_error < error) if (max_error < error)
max_error = error; max_error = error;
......
...@@ -391,8 +391,8 @@ typedef struct macroblockd { ...@@ -391,8 +391,8 @@ typedef struct macroblockd {
int lossless; int lossless;
/* Inverse transform function pointers. */ /* Inverse transform function pointers. */
void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch); void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch); void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob); void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride, void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
struct macroblockd *xd); struct macroblockd *xd);
......
...@@ -18,12 +18,12 @@ ...@@ -18,12 +18,12 @@
#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_idct.h"
void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int i; int i;
int16_t output[16];
int a1, b1, c1, d1; int a1, b1, c1, d1;
int16_t *ip = input; int16_t *ip = input;
int16_t *op = output; int16_t *op = output;
const int half_pitch = pitch >> 1;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR; a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
...@@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { ...@@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
op[3] = (d1 - c1) >> 1; op[3] = (d1 - c1) >> 1;
ip += 4; ip += 4;
op += half_pitch; op += 4;
} }
ip = output; ip = output;
op = output;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = ip[half_pitch * 0] + ip[half_pitch * 3]; a1 = ip[4 * 0] + ip[4 * 3];
b1 = ip[half_pitch * 1] + ip[half_pitch * 2]; b1 = ip[4 * 1] + ip[4 * 2];
c1 = ip[half_pitch * 1] - ip[half_pitch * 2]; c1 = ip[4 * 1] - ip[4 * 2];
d1 = ip[half_pitch * 0] - ip[half_pitch * 3]; d1 = ip[4 * 0] - ip[4 * 3];
op[half_pitch * 0] = (a1 + b1 + 1) >> 1; dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
op[half_pitch * 1] = (c1 + d1) >> 1; ((a1 + b1 + 1) >> 1));
op[half_pitch * 2] = (a1 - b1) >> 1; dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
op[half_pitch * 3] = (d1 - c1) >> 1; ((c1 + d1) >> 1));
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
((a1 - b1) >> 1));
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
((d1 - c1) >> 1));
ip++; ip++;
op++; dest++;
} }
} }
void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) { void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
int i; int i;
int16_t tmp[4]; int16_t tmp[4];
int16_t *ip = in; int16_t *ip = in;
int16_t *op = tmp; int16_t *op = tmp;
const int half_pitch = pitch >> 1;
op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1; op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
ip = tmp; ip = tmp;
op = out;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
op[half_pitch * 0] = (ip[0] + 1) >> 1; dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1; ((ip[0] + 1) >> 1));
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
(ip[0] >> 1));
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
(ip[0] >> 1));
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
(ip[0] >> 1));
ip++; ip++;
op++; dest++;
} }
} }
void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
uint8_t *dst_ptr, uint8_t *dst_ptr,
int pitch, int stride) { int pitch, int stride) {
int r, c;
int16_t dc = input_dc; int16_t dc = input_dc;
int16_t tmp[4 * 4]; vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++)
dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
dst_ptr += stride;
pred_ptr += pitch;
}
} }
void vp9_idct4_1d_c(int16_t *input, int16_t *output) { void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
...@@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) { ...@@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
output[3] = step[0] - step[3]; output[3] = step[0] - step[3];
} }
void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[4 * 4]; int16_t out[4 * 4];
int16_t *outptr = out; int16_t *outptr = out;
const int half_pitch = pitch >> 1;
int i, j; int i, j;
int16_t temp_in[4], temp_out[4]; int16_t temp_in[4], temp_out[4];
...@@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { ...@@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
temp_in[j] = out[j * 4 + i]; temp_in[j] = out[j * 4 + i];
vp9_idct4_1d(temp_in, temp_out); vp9_idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j)
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ dest[j * dest_stride + i]);
} }
} }
void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int i; int i;
int a1; int a1;
int16_t *op = output;
const int half_pitch = pitch >> 1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64); int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4); a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
op[0] = op[1] = op[2] = op[3] = a1; dest[0] = clip_pixel(dest[0] + a1);
op += half_pitch; dest[1] = clip_pixel(dest[1] + a1);
dest[2] = clip_pixel(dest[2] + a1);
dest[3] = clip_pixel(dest[3] + a1);
dest += dest_stride;
} }
} }
...@@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) { ...@@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(s3); output[3] = dct_const_round_shift(s3);
} }
void vp9_short_iht4x4_c(int16_t *input, int16_t *output, void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
int pitch, int tx_type) { int tx_type) {
const transform_2d IHT_4[] = { const transform_2d IHT_4[] = {
{ vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0
{ iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1
...@@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output, ...@@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
temp_in[j] = out[j * 4 + i]; temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out); IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j)
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ dest[j * dest_stride + i]);
} }
} }
static void iadst8_1d(int16_t *input, int16_t *output) { static void iadst8_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7; int s0, s1, s2, s3, s4, s5, s6, s7;
......
...@@ -11,11 +11,10 @@ ...@@ -11,11 +11,10 @@
#include "vp9/common/vp9_invtrans.h" #include "vp9/common/vp9_invtrans.h"
#include "./vp9_rtcd.h" #include "./vp9_rtcd.h"
void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
int16_t *dqcoeff, int16_t *diff, uint8_t *dest, int stride) {
int pitch) {
if (eob <= 1) if (eob <= 1)
xd->inv_txm4x4_1(dqcoeff, diff, pitch); xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
else else
xd->inv_txm4x4(dqcoeff, diff, pitch); xd->inv_txm4x4_add(dqcoeff, dest, stride);
} }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_blockd.h"
void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
int16_t *dqcoeff, int16_t *diff, uint8_t *dest, int stride);
int pitch);
#endif // VP9_COMMON_VP9_INVTRANS_H_ #endif // VP9_COMMON_VP9_INVTRANS_H_
...@@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO ...@@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO
specialize vp9_intra4x4_predict; specialize vp9_intra4x4_predict;
if [ "$CONFIG_VP9_DECODER" = "yes" ]; then if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_4x4 sse2
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2 specialize vp9_add_constant_residual_8x8 sse2
...@@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3 ...@@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3
# #
# dct # dct
# #
prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct4x4_1 specialize vp9_short_idct4x4_1_add
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct4x4 sse2 specialize vp9_short_idct4x4_add sse2
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2 specialize vp9_short_idct8x8_add sse2
...@@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32 ...@@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_32x32_add specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht4x4_add
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht8x8_add specialize vp9_short_iht8x8_add
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16_add specialize vp9_short_iht16x16_add
...@@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2 ...@@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
specialize vp9_dc_only_idct_add sse2 specialize vp9_dc_only_idct_add sse2
prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_iwalsh4x4_1 specialize vp9_short_iwalsh4x4_1_add
prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_iwalsh4x4 prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" specialize vp9_short_iwalsh4x4_add
specialize vp9_dc_only_inv_walsh_add
prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
specialize vp9_sad32x3 specialize vp9_sad32x3
......
...@@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, ...@@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
*(int *)dst_ptr = _mm_cvtsi128_si32(p1); *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
} }
void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8); const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
...@@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
(int16_t)cospi_24_64, (int16_t)-cospi_8_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
(int16_t)cospi_8_64, (int16_t)cospi_24_64); (int16_t)cospi_8_64, (int16_t)cospi_24_64);
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const int half_pitch = pitch >> 1;
__m128i input0, input1, input2, input3; __m128i input0, input1, input2, input3;
// Rows // Rows
...@@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { ...@@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
input2 = _mm_srai_epi16(input2, 4); input2 = _mm_srai_epi16(input2, 4);
input3 = _mm_srai_epi16(input3, 4); input3 = _mm_srai_epi16(input3, 4);
// Store results #define RECON_AND_STORE4X4(dest, in_x) \
_mm_storel_epi64((__m128i *)output, input2); { \
input2 = _mm_srli_si128(input2, 8); __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
_mm_storel_epi64((__m128i *)(output + half_pitch), input2); d0 = _mm_unpacklo_epi8(d0, zero); \
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
*(int *)dest = _mm_cvtsi128_si32(d0); \
dest += stride; \
}
input0 = _mm_srli_si128(input2, 8);
input1 = _mm_srli_si128(input3, 8);
_mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3); RECON_AND_STORE4X4(dest, input2);
input3 = _mm_srli_si128(input3, 8); RECON_AND_STORE4X4(dest, input0);
_mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3); RECON_AND_STORE4X4(dest, input1);
RECON_AND_STORE4X4(dest, input3);
} }
void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
......
...@@ -1006,14 +1006,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { ...@@ -1006,14 +1006,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
pc->uv_dc_delta_q == 0 && pc->uv_dc_delta_q == 0 &&
pc->uv_ac_delta_q == 0; pc->uv_ac_delta_q == 0;
if (xd->lossless) { if (xd->lossless) {
xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
xd->inv_txm4x4 = vp9_short_iwalsh4x4;
xd->itxm_add = vp9_idct_add_lossless_c; xd->itxm_add = vp9_idct_add_lossless_c;
xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c; xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c; xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
} else { } else {
xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
xd->inv_txm4x4 = vp9_short_idct4x4;
xd->itxm_add = vp9_idct_add; xd->itxm_add = vp9_idct_add;
xd->itxm_add_y_block = vp9_idct_add_y_block; xd->itxm_add_y_block = vp9_idct_add_y_block;
xd->itxm_add_uv_block = vp9_idct_add_uv_block; xd->itxm_add_uv_block = vp9_idct_add_uv_block;
......
...@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride, ...@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
} }
} }
static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
for (r = 0; r < height; r++) {
for (c = 0; c < width; c++)
dest[c] = clip_pixel(diff[c] + dest[c]);
dest += stride;
diff += width;
}
}
void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
add_residual(diff, dest, stride, 4, 4);
}
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) { int width, int height) {
int r, c; int r, c;
...@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, ...@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
if (tx_type == DCT_DCT) { if (tx_type == DCT_DCT) {
vp9_idct_add(input, dest, stride, eob); vp9_idct_add(input, dest, stride, eob);
} else { } else {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); vp9_short_iht4x4_add(input, dest, stride, tx_type);
vp9_short_iht4x4(input, output, 4, tx_type);
vpx_memset(input, 0, 32); vpx_memset(input, 0, 32);
vp9_add_residual_4x4(output, dest, stride);
} }
} }
...@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, ...@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
} }
void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
if (eob > 1) { if (eob > 1) {
// the idct halves ( >> 1) the pitch vp9_short_idct4x4_add(input, dest, stride);
vp9_short_idct4x4(input, output, 4 << 1);
vpx_memset(input, 0, 32); vpx_memset(input, 0, 32);
vp9_add_residual_4x4(output, dest, stride);
} else { } else {
vp9_dc_only_idct_add(input[0], dest, dest, stride, stride); vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
((int *)input)[0] = 0; ((int *)input)[0] = 0;
...@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { ...@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
} }
void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) { void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
input[0] = dc; input[0] = dc;
vp9_short_idct4x4_add(input, dest, stride);
// the idct halves ( >> 1) the pitch
vp9_short_idct4x4(input, output, 4 << 1);
vpx_memset(input, 0, 32); vpx_memset(input, 0, 32);
vp9_add_residual_4x4(output, dest, stride);
} }
void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
int eob) { int eob) {
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
if (eob > 1) { if (eob > 1) {
vp9_short_iwalsh4x4_c(input, output, 4 << 1); vp9_short_iwalsh4x4_add(input, dest, stride);
vpx_memset(input, 0, 32); vpx_memset(input, 0, 32);
vp9_add_residual_4x4(output, dest, stride);
} else { } else {
vp9_dc_only_inv_walsh_add(