Commit e253eaa0 authored by Jingning Han's avatar Jingning Han

Unify the high bit-depth forward hybrid transforms

The SSE2 version high bit-depth forward hybrid transforms are
essentially using the C functions via cross referencing to 1-D
functions in vp9_dct.c. This commit unifies the two versions and
removes the unnecessary dependency.

Change-Id: Ib4d0702a138f8daf7d0bd97c141ee7088f293765
parent 389ed6da
......@@ -901,14 +901,6 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
......
......@@ -531,14 +531,6 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
......
......@@ -933,13 +933,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht4x4 sse2/;
specialize qw/vp9_highbd_fht4x4/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht8x8 sse2/;
specialize qw/vp9_highbd_fht8x8/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht16x16 sse2/;
specialize qw/vp9_highbd_fht16x16/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
......
......@@ -2266,108 +2266,6 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
store_output(&in1, output);
}
#if CONFIG_VP9_HIGHBITDEPTH
/* These SSE2 versions of the FHT functions only actually use SSE2 in the
* DCT_DCT case in all other cases, they revert to C code which is identical
* to that used by the C versions of them.
*/
void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_highbd_fdct4x4_sse2(input, output, stride);
} else {
tran_low_t out[4 * 4];
tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
outptr[j * 4 + i] = temp_out[j];
}
// Rows
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j + i * 4];
ht.rows(temp_in, temp_out);
for (j = 0; j < 4; ++j)
output[j + i * 4] = (temp_out[j] + 1) >> 2;
}
}
}
void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_highbd_fdct8x8_sse2(input, output, stride);
} else {
tran_low_t out[64];
tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
}
// Rows
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
}
}
}
void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_highbd_fdct16x16_sse2(input, output, stride);
} else {
tran_low_t out[256];
tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j + i * 16] = temp_out[j];
}
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
/*
* The DCTnxn functions are defined using the macros below. The main code for
* them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h &
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment