Commit 74854987 authored by Debargha Mukherjee's avatar Debargha Mukherjee
Browse files

Extends ext-tx to support 32x32 masked transforms

Adds new 32x32 masked 1-D transforms that combine a length-16 1-D
DCT with a length-16 identity transform.

To be continued in subsequent patches.

Change-Id: I0b4f66492d44c079b3c3b531ba48a97201de1484
parent 907544a3
......@@ -259,6 +259,73 @@ void idst16_c(const tran_low_t *input, tran_low_t *output) {
output[15] = WRAPLOW(-step2[0] + step2[15], 8);
}
#if CONFIG_EXT_TX
// For use in lieu of DST.
//
// Inverse 32-point "half-center" transform:
//   - input[0..15] are scaled by sqrt(2) and run through the 16-point inverse
//     DCT, whose result lands in the centre of the output, output[8..23];
//   - input[16..23] and input[24..31] are passed through (times 4) to
//     output[0..7] and output[24..31] respectively.
// Note overall scaling factor is 4 times orthogonal.
//
// The function is safe to call with input == output (the 32x32 inverse
// hybrid transform runs its column pass in place): every input coefficient
// that an output write could clobber is read before that write happens.
static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  tran_low_t inputhalf[16];
  // Multiply the DCT half by sqrt(2). This must happen before the
  // pass-through copy below, which overwrites output[0..7] and would
  // otherwise corrupt input[0..7] when the buffers alias.
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
  }
  // Identity (pass-through) part, scaled by 4.
  for (i = 0; i < 8; ++i) {
    output[i] = input[16 + i] * 4;
    output[24 + i] = input[24 + i] * 4;
  }
  // input[16..23] has been consumed above, so writing output[8..23] is safe.
  idct16_c(inputhalf, output + 8);
}
// Used in the ADST/FLIPADST slots of the 32x32 inverse transform table.
//
// Inverse 32-point "half-right" transform:
//   - input[0..15] are scaled by sqrt(2) and run through the 16-point inverse
//     DCT, whose result lands in the right half of the output, output[16..31];
//   - input[16..31] are passed through (times 4) to output[0..15].
// Note overall scaling factor is 4 times orthogonal.
//
// The function is safe to call with input == output (the 32x32 inverse
// hybrid transform runs its column pass in place): every input coefficient
// that an output write could clobber is read before that write happens.
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  tran_low_t inputhalf[16];
  // Multiply the DCT half by sqrt(2). This must happen before the
  // pass-through copy below, which overwrites output[0..15] and would
  // otherwise corrupt input[0..15] when the buffers alias.
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
  }
  // Identity (pass-through) part, scaled by 4.
  for (i = 0; i < 16; ++i) {
    output[i] = input[16 + i] * 4;
  }
  // input[16..31] has been consumed above, so writing output[16..31] is safe.
  idct16_c(inputhalf, output + 16);
}
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth counterpart of ihalfcenter32_c (used in the DST slots of the
// high-bitdepth 32x32 inverse transform table).
//
//   - input[0..15] are scaled by sqrt(2) and run through the 16-point
//     high-bitdepth inverse DCT into output[8..23];
//   - input[16..23] and input[24..31] are passed through (times 4) to
//     output[0..7] and output[24..31] respectively.
// bd is forwarded to the rounding helper and to vpx_highbd_idct16_c.
// Note overall scaling factor is 4 times orthogonal.
//
// Safe to call with input == output: the 32x32 high-bitdepth inverse hybrid
// transform runs its column pass in place.
static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
                                   int bd) {
  int i;
  tran_low_t inputhalf[16];
  // Multiply the DCT half by sqrt(2). This must happen before the
  // pass-through copy below, which overwrites output[0..7] and would
  // otherwise corrupt input[0..7] when the buffers alias.
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
        input[i] * Sqrt2, bd);
  }
  // Identity (pass-through) part, scaled by 4.
  for (i = 0; i < 8; ++i) {
    output[i] = input[16 + i] * 4;
    output[24 + i] = input[24 + i] * 4;
  }
  // input[16..23] has been consumed above, so writing output[8..23] is safe.
  vpx_highbd_idct16_c(inputhalf, output + 8, bd);
}
// High-bitdepth counterpart of ihalfright32_c (used in the ADST/FLIPADST
// slots of the high-bitdepth 32x32 inverse transform table).
//
//   - input[0..15] are scaled by sqrt(2) and run through the 16-point
//     high-bitdepth inverse DCT into output[16..31];
//   - input[16..31] are passed through (times 4) to output[0..15].
// bd is forwarded to the rounding helper and to vpx_highbd_idct16_c.
// Note overall scaling factor is 4 times orthogonal.
//
// Safe to call with input == output: the 32x32 high-bitdepth inverse hybrid
// transform runs its column pass in place.
static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
                                  int bd) {
  int i;
  tran_low_t inputhalf[16];
  // Multiply the DCT half by sqrt(2). This must happen before the
  // pass-through copy below, which overwrites output[0..15] and would
  // otherwise corrupt input[0..15] when the buffers alias.
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
        input[i] * Sqrt2, bd);
  }
  // Identity (pass-through) part, scaled by 4.
  for (i = 0; i < 16; ++i) {
    output[i] = input[16 + i] * 4;
  }
  // input[16..31] has been consumed above, so writing output[16..31] is safe.
  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
// Inverse identity transform and add.
static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int bs) {
......@@ -808,6 +875,67 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
}
}
#if CONFIG_EXT_TX
// 32x32 inverse hybrid transform, result added into dest.
//
// input   : 32x32 coefficients, row-major, 32 values per row.
// dest    : 8-bit destination pixels; the reconstructed residual is added to
//           them via clip_pixel_add.
// stride  : distance between destination rows.
// tx_type : index into the 16-entry IHT_32 table below; values outside
//           0..15 are not handled here.
//
// Steps: 1-D row transforms, in-memory transpose, 1-D column transforms
// (run in place, so the 1-D kernels are called with input == output), then a
// rounded right shift by 6 and accumulation into dest.
void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride, int tx_type) {
  static const transform_2d IHT_32[] = {
    { idct32_c,        idct32_c        },  // DCT_DCT           = 0,
    { ihalfright32_c,  idct32_c        },  // ADST_DCT          = 1,
    { idct32_c,        ihalfright32_c  },  // DCT_ADST          = 2,
    { ihalfright32_c,  ihalfright32_c  },  // ADST_ADST         = 3,
    { ihalfright32_c,  idct32_c        },  // FLIPADST_DCT      = 4,
    { idct32_c,        ihalfright32_c  },  // DCT_FLIPADST      = 5,
    { ihalfright32_c,  ihalfright32_c  },  // FLIPADST_FLIPADST = 6,
    { ihalfright32_c,  ihalfright32_c  },  // ADST_FLIPADST     = 7,
    { ihalfright32_c,  ihalfright32_c  },  // FLIPADST_ADST     = 8,
    { ihalfcenter32_c, idct32_c        },  // DST_DCT           = 9,
    { idct32_c,        ihalfcenter32_c },  // DCT_DST           = 10,
    { ihalfcenter32_c, ihalfright32_c  },  // DST_ADST          = 11,
    { ihalfright32_c,  ihalfcenter32_c },  // ADST_DST          = 12,
    { ihalfcenter32_c, ihalfright32_c  },  // DST_FLIPADST      = 13,
    { ihalfright32_c,  ihalfcenter32_c },  // FLIPADST_DST      = 14,
    { ihalfcenter32_c, ihalfcenter32_c },  // DST_DST           = 15
  };
  tran_low_t out[32][32];
  tran_low_t *outp = &out[0][0];
  int outstride = 32;
  int r, c;

  // Pass 1: one 1-D transform per coefficient row.
  for (r = 0; r < 32; ++r) {
    IHT_32[tx_type].rows(input + 32 * r, out[r]);
  }

  // Transpose in place by swapping each element above the diagonal with its
  // mirror below it.
  for (r = 0; r < 32; ++r) {
    for (c = r + 1; c < 32; ++c) {
      const tran_low_t above = out[r][c];
      out[r][c] = out[c][r];
      out[c][r] = above;
    }
  }

  // Pass 2: one 1-D transform per (now row-resident) column, in place.
  for (r = 0; r < 32; ++r) {
    IHT_32[tx_type].cols(out[r], out[r]);
  }

  // Gives the flip helper a chance to adjust the destination / intermediate
  // pointers and strides for this tx_type.
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);

  // Accumulate the rounded residual into the destination. The intermediate
  // buffer is read transposed relative to dest.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      const int dst_idx = r * stride + c;
      const int src_idx = c * outstride + r;
      dest[dst_idx] = clip_pixel_add(dest[dst_idx],
                                     ROUND_POWER_OF_TWO(outp[src_idx], 6));
    }
  }
}
#endif // CONFIG_EXT_TX
// idct
void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
......@@ -998,15 +1126,27 @@ void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
vp10_idct32x32_add(input, dest, stride, eob);
break;
#if CONFIG_EXT_TX
case IDTX:
inv_idtx_add_c(input, dest, stride, 32);
break;
#endif // CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
assert(0);
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case DST_DST:
case DST_DCT:
case DCT_DST:
case DST_ADST:
case ADST_DST:
case FLIPADST_DST:
case DST_FLIPADST:
vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
break;
case IDTX:
inv_idtx_add_c(input, dest, stride, 32);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
......@@ -1212,6 +1352,70 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
#if CONFIG_EXT_TX
// High-bitdepth 32x32 inverse hybrid transform, result added into dest8.
//
// input   : 32x32 coefficients, row-major, 32 values per row.
// dest8   : destination buffer; converted with CONVERT_TO_SHORTPTR and then
//           accessed as uint16_t pixels.
// stride  : distance between destination rows (in uint16_t elements after
//           the pointer conversion).
// tx_type : index into the 16-entry HIGH_IHT_32 table below; values outside
//           0..15 are not handled here.
// bd      : bit depth, forwarded to the 1-D kernels and to
//           highbd_clip_pixel_add.
//
// Steps: 1-D row transforms, in-memory transpose, 1-D column transforms
// (run in place, so the 1-D kernels are called with input == output), then a
// rounded right shift by 6 and accumulation into the destination.
void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int tx_type, int bd) {
  static const highbd_transform_2d HIGH_IHT_32[] = {
    { vpx_highbd_idct32_c,    vpx_highbd_idct32_c    },  // DCT_DCT
    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // ADST_DCT
    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_ADST
    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_ADST
    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // FLIPADST_DCT
    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_FLIPADST
    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c    },  // DST_DCT
    { vpx_highbd_idct32_c,    highbd_ihalfcenter32_c },  // DCT_DST
    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_ADST
    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // ADST_DST
    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_FLIPADST
    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // FLIPADST_DST
    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
  };
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  tran_low_t out[32][32];
  tran_low_t *outp = &out[0][0];
  int outstride = 32;
  int r, c;

  // Pass 1: one 1-D transform per coefficient row.
  for (r = 0; r < 32; ++r) {
    HIGH_IHT_32[tx_type].rows(input + 32 * r, out[r], bd);
  }

  // Transpose in place by swapping each element above the diagonal with its
  // mirror below it.
  for (r = 0; r < 32; ++r) {
    for (c = r + 1; c < 32; ++c) {
      const tran_low_t above = out[r][c];
      out[r][c] = out[c][r];
      out[c][r] = above;
    }
  }

  // Pass 2: one 1-D transform per (now row-resident) column, in place.
  for (r = 0; r < 32; ++r) {
    HIGH_IHT_32[tx_type].cols(out[r], out[r], bd);
  }

  // Gives the flip helper a chance to adjust the destination / intermediate
  // pointers and strides for this tx_type.
  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);

  // Accumulate the rounded residual into the destination. The intermediate
  // buffer is read transposed relative to dest.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      const int dst_idx = r * stride + c;
      const int src_idx = c * outstride + r;
      dest[dst_idx] = highbd_clip_pixel_add(
          dest[dst_idx], ROUND_POWER_OF_TWO(outp[src_idx], 6), bd);
    }
  }
}
#endif // CONFIG_EXT_TX
// idct
void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd) {
......@@ -1409,15 +1613,27 @@ void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
break;
#if CONFIG_EXT_TX
case IDTX:
highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
break;
#endif // CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
assert(0);
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case DST_DST:
case DST_DCT:
case DCT_DST:
case DST_ADST:
case ADST_DST:
case FLIPADST_DST:
case DST_FLIPADST:
vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
break;
case IDTX:
highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
......
......@@ -404,6 +404,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht16x16 sse2/;
add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht32x32/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
} else {
......@@ -416,6 +419,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht16x16 sse2 msa/;
add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht32x32/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
......@@ -642,6 +648,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_highbd_fht16x16/;
add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_highbd_fht32x32/;
add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fwht4x4/;
......
......@@ -14,7 +14,6 @@
#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vp10/common/blockd.h"
#include "vp10/common/idct.h"
#include "vpx_dsp/fwd_txfm.h"
......@@ -538,7 +537,7 @@ static void fdct16(const tran_low_t *input, tran_low_t *output) {
range_check(output, 16, 16);
}
/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
#if CONFIG_EXT_TX
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
......@@ -936,7 +935,7 @@ static void fdct32(const tran_low_t *input, tran_low_t *output) {
range_check(output, 32, 18);
}
*/
#endif // CONFIG_EXT_TX
static void fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
......@@ -1213,6 +1212,37 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
}
#if CONFIG_EXT_TX
// For use in lieu of DST.
//
// Forward 32-point "half-center" transform:
//   - the outer samples input[0..7] and input[24..31] are passed through
//     (times 4) to output[16..23] and output[24..31];
//   - the centre samples input[8..23] are scaled by sqrt(2) and run through
//     the 16-point forward DCT into output[0..15].
// Note overall scaling factor is 4 times orthogonal.
static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
  tran_low_t centre[16];
  int k;

  // Pass-through of the left outer quarter.
  for (k = 0; k < 8; ++k) {
    output[16 + k] = input[k] * 4;
  }
  // Pass-through of the right outer quarter.
  for (k = 24; k < 32; ++k) {
    output[k] = input[k] * 4;
  }

  // Multiply the centre half by sqrt(2) before the DCT.
  for (k = 0; k < 16; ++k) {
    centre[k] = (tran_low_t)fdct_round_shift(input[8 + k] * Sqrt2);
  }
  fdct16(centre, output);
}
// For use in lieu of ADST.
//
// Forward 32-point "half-right" transform:
//   - the left half input[0..15] is passed through (times 4) to
//     output[16..31];
//   - the right half input[16..31] is scaled by sqrt(2) and run through the
//     16-point forward DCT into output[0..15].
// Note overall scaling factor is 4 times orthogonal.
static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
  tran_low_t right_half[16];
  const tran_low_t *const right_in = input + 16;
  tran_low_t *const identity_out = output + 16;
  int k;

  // Pass-through of the left half.
  for (k = 0; k < 16; ++k) {
    identity_out[k] = input[k] * 4;
  }

  // Multiply the right half by sqrt(2) before the DCT.
  for (k = 0; k < 16; ++k) {
    right_half[k] = (tran_low_t)fdct_round_shift(right_in[k] * Sqrt2);
  }
  fdct16(right_half, output);
}
static void copy_block(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
int i;
......@@ -1375,6 +1405,27 @@ static const transform_2d FHT_16[] = {
#endif // CONFIG_EXT_TX
};
#if CONFIG_EXT_TX
// 1-D forward transform pairs for 32x32 blocks, indexed by tx_type (the
// trailing comment on each row gives the index it is expected to occupy).
// The table has 16 entries (indices 0..15); there is no entry past DST_DST.
// Every ADST and FLIPADST slot uses fhalfright32 and every DST slot uses
// fhalfcenter32; DCT slots use fdct32.
// NOTE(review): which member of each pair is the column transform and which
// is the row transform depends on the member order of transform_2d, which is
// declared elsewhere - confirm against that definition.
static const transform_2d FHT_32[] = {
{ fdct32, fdct32 }, // DCT_DCT = 0,
{ fhalfright32, fdct32 }, // ADST_DCT = 1,
{ fdct32, fhalfright32 }, // DCT_ADST = 2,
{ fhalfright32, fhalfright32 }, // ADST_ADST = 3,
{ fhalfright32, fdct32 }, // FLIPADST_DCT = 4,
{ fdct32, fhalfright32 }, // DCT_FLIPADST = 5,
{ fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST = 6,
{ fhalfright32, fhalfright32 }, // ADST_FLIPADST = 7,
{ fhalfright32, fhalfright32 }, // FLIPADST_ADST = 8,
{ fhalfcenter32, fdct32 }, // DST_DCT = 9,
{ fdct32, fhalfcenter32 }, // DCT_DST = 10,
{ fhalfcenter32, fhalfright32 }, // DST_ADST = 11,
{ fhalfright32, fhalfcenter32 }, // ADST_DST = 12,
{ fhalfcenter32, fhalfright32 }, // DST_FLIPADST = 13,
{ fhalfright32, fhalfcenter32 }, // FLIPADST_DST = 14,
{ fhalfcenter32, fhalfcenter32 }, // DST_DST = 15
};
#endif // CONFIG_EXT_TX
void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
......@@ -1671,3 +1722,46 @@ void vp10_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
vp10_fht16x16_c(input, output, stride, tx_type);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EXT_TX
// 32x32 forward hybrid transform.
//
// input   : 32x32 block of residual samples, `stride` elements between rows.
// output  : 32x32 coefficients, row-major, 32 values per row.
// tx_type : DCT_DCT is delegated to vpx_fdct32x32_c; any other value selects
//           a column/row transform pair from FHT_32.
//
// For the non-DCT_DCT path each column is pre-scaled by 4, transformed and
// rounded down by 2 bits; each row of that intermediate is then transformed
// and rounded down by 2 bits again. The two rounding steps bias in opposite
// directions: the column pass adds an extra 1 for positive values, the row
// pass for negative values.
void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
                     int stride, int tx_type) {
  if (tx_type != DCT_DCT) {
    tran_low_t intermediate[1024];
    tran_low_t line_in[32], line_out[32];
    const transform_2d ht = FHT_32[tx_type];
    int16_t flipped_input[32 * 32];
    int line, k;

    // Lets the flip helper redirect input/stride (using flipped_input as
    // scratch) for this tx_type.
    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);

    // Column pass.
    for (line = 0; line < 32; ++line) {
      for (k = 0; k < 32; ++k) {
        line_in[k] = input[k * stride + line] * 4;
      }
      ht.cols(line_in, line_out);
      for (k = 0; k < 32; ++k) {
        const tran_low_t v = line_out[k];
        intermediate[k * 32 + line] = (v + 1 + (v > 0)) >> 2;
      }
    }

    // Row pass.
    for (line = 0; line < 32; ++line) {
      for (k = 0; k < 32; ++k) {
        line_in[k] = intermediate[line * 32 + k];
      }
      ht.rows(line_in, line_out);
      for (k = 0; k < 32; ++k) {
        const tran_low_t v = line_out[k];
        output[line * 32 + k] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
      }
    }
  } else {
    vpx_fdct32x32_c(input, output, stride);
  }
}
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth entry point for the 32x32 forward hybrid transform.
// Forwards all four arguments unchanged to vp10_fht32x32_c; there is no
// separate high-bitdepth code path in this function.
void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
vp10_fht32x32_c(input, output, stride, tx_type);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
......@@ -2057,8 +2057,8 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
static void highbd_idct32_c(const tran_low_t *input,
tran_low_t *output, int bd) {
void vpx_highbd_idct32_c(const tran_low_t *input,
tran_low_t *output, int bd) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
(void) bd;
......@@ -2447,7 +2447,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
highbd_idct32_c(input, outptr, bd);
vpx_highbd_idct32_c(input, outptr, bd);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
......@@ -2458,7 +2458,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd);
vpx_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
......@@ -2477,7 +2477,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
// Only upper-left 8x8 has non-zero coeff.
for (i = 0; i < 8; ++i) {
highbd_idct32_c(input, outptr, bd);
vpx_highbd_idct32_c(input, outptr, bd);
input += 32;
outptr += 32;
}
......@@ -2485,7 +2485,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd);
vpx_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
......
......@@ -100,6 +100,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output);
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
......
......@@ -57,10 +57,13 @@ static const tran_high_t cospi_29_64 = 2404;
static const tran_high_t cospi_30_64 = 1606;
static const tran_high_t cospi_31_64 = 804;
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
static const tran_high_t sinpi_1_9 = 5283;
static const tran_high_t sinpi_2_9 = 9929;
static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
// 16384 * sqrt(2)
static const tran_high_t Sqrt2 = 23170;
#endif // VPX_DSP_TXFM_COMMON_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment