Commit 31c66502 authored by Sarah Parker

Remove deprecated high-bitdepth functions

This unifies the codepath for high-bitdepth transforms and deletes
all calls to the old deprecated versions. This required reworking
the way 1D configurations are combined in order to support rectangular
transforms.

There is one remaining codepath that calls the deprecated 4x4 hbd
transform from encoder/encodemb.c. I need to take a closer look
at what is happening there and will leave that for a follow-up,
since this change has already gotten so large.

lowres 10 bit: -0.035%
lowres 12 bit: 0.021%

BUG=aomedia:524

Change-Id: I34cdeaed2461ed7942364147cef10d7d21e3779c
parent ee4a34c2
@@ -20,10 +20,5 @@ static INLINE tran_high_t saturate_int16(tran_high_t value) {
return result < INT16_MIN ? INT16_MIN : result;
}
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return rv;
}
void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
#endif // AOM_DSP_FWD_TXFM_H_
@@ -67,4 +67,8 @@ static const tran_high_t sinpi_4_9 = 15212;
// 16384 * sqrt(2)
static const tran_high_t Sqrt2 = 23170;
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return rv;
}
#endif // AOM_DSP_TXFM_COMMON_H_
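For reference, fdct_round_shift(input) computes round(input / 2^DCT_CONST_BITS). Assuming DCT_CONST_BITS == 14 (consistent with the "16384 * sqrt(2)" comment above, since 16384 == 2^14), fdct_round_shift(x * Sqrt2) scales x by sqrt(2) in Q14 fixed point: for example, fdct_round_shift(100 * 23170) == (2317000 + 8192) >> 14 == 141, i.e. approximately 100 * sqrt(2). This is the identity the rectangular-transform code below relies on.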
@@ -3627,108 +3627,4 @@ void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
}
}
}
void aom_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// only the first 4 rows have non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
aom_idct8_sse2(inptr);
// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use fact only first 4 rows contain non-zero coeffs
array_transpose_4X8(inptr, inptr);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
aom_highbd_idct8_c(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
aom_idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
aom_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
}
}
}
}
#endif // CONFIG_HIGHBITDEPTH
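A minimal sketch of the dispatch idiom the deleted function used, written with standard SSE2 intrinsics (the helper name is hypothetical): pack coefficients to 16 bits, then test whether any lane falls outside a range known to be safe for the 16-bit transform, and fall back to the C implementation when it does.

#include <emmintrin.h>

// Returns nonzero if any 16-bit lane of v lies outside [-bound, bound];
// the deleted 8x8 path above used bound = 6201.
static int any_outside_range(__m128i v, short bound) {
  const __m128i hi = _mm_cmpgt_epi16(v, _mm_set1_epi16(bound));
  const __m128i lo = _mm_cmplt_epi16(v, _mm_set1_epi16(-bound));
  return _mm_movemask_epi8(_mm_or_si128(hi, lo));
}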
@@ -12,6 +12,7 @@
#include <assert.h>
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "av1/common/enums.h"
#include "av1/common/av1_fwd_txfm1d.h"
#include "av1/common/av1_fwd_txfm1d_cfg.h"
@@ -41,9 +42,17 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
const int stride, const TXFM_2D_FLIP_CFG *cfg,
int32_t *buf) {
int c, r;
// TODO(sarahparker) must correct for rectangular transforms in follow up
const int txfm_size = cfg->row_cfg->txfm_size;
const int8_t *shift = cfg->row_cfg->shift;
// Note that when assigning txfm_size_col, we use the txfm_size from the
// row configuration and vice versa. This is intentionally done to
// accurately perform rectangular transforms. When the transform is
// rectangular, the number of columns will be the same as the
// txfm_size stored in the row cfg struct. It will make no difference
// for square transforms.
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
// Take the shift from the larger dimension in the rectangular case.
const int8_t *shift =
txfm_size_col > txfm_size_row ? cfg->row_cfg->shift : cfg->col_cfg->shift;
const int8_t *stage_range_col = cfg->col_cfg->stage_range;
const int8_t *stage_range_row = cfg->row_cfg->stage_range;
const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
@@ -53,37 +62,99 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
// use output buffer as temp buffer
int32_t *temp_in = output;
int32_t *temp_out = output + txfm_size;
int32_t *temp_out = output + txfm_size_row;
// Columns
for (c = 0; c < txfm_size; ++c) {
for (c = 0; c < txfm_size_col; ++c) {
if (cfg->ud_flip == 0) {
for (r = 0; r < txfm_size; ++r) temp_in[r] = input[r * stride + c];
for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
} else {
for (r = 0; r < txfm_size; ++r)
for (r = 0; r < txfm_size_row; ++r)
// flip upside down
temp_in[r] = input[(txfm_size - r - 1) * stride + c];
temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
}
round_shift_array(temp_in, txfm_size_row, -shift[0]);
// Multiply everything by Sqrt2 on the larger dimension if the
// transform is rectangular
if (txfm_size_col > txfm_size_row) {
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2);
}
round_shift_array(temp_in, txfm_size, -shift[0]);
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
round_shift_array(temp_out, txfm_size, -shift[1]);
round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->lr_flip == 0) {
for (r = 0; r < txfm_size; ++r) buf[r * txfm_size + c] = temp_out[r];
for (r = 0; r < txfm_size_row; ++r)
buf[r * txfm_size_col + c] = temp_out[r];
} else {
for (r = 0; r < txfm_size; ++r)
for (r = 0; r < txfm_size_row; ++r)
// flip from left to right
buf[r * txfm_size + (txfm_size - c - 1)] = temp_out[r];
buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
}
}
// Rows
for (r = 0; r < txfm_size; ++r) {
txfm_func_row(buf + r * txfm_size, output + r * txfm_size, cos_bit_row,
stage_range_row);
round_shift_array(output + r * txfm_size, txfm_size, -shift[2]);
for (r = 0; r < txfm_size_row; ++r) {
// Multiply everything by Sqrt2 on the larger dimension if the
// transform is rectangular
if (txfm_size_row > txfm_size_col) {
for (c = 0; c < txfm_size_col; ++c)
buf[r * txfm_size_col + c] =
(int32_t)fdct_round_shift(buf[r * txfm_size_col + c] * Sqrt2);
}
txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
cos_bit_row, stage_range_row);
round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
}
}
void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[4 * 8];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X8);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[8 * 4];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X4);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[8 * 16];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X16);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[16 * 8];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X8);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[16 * 32];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X32);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[32 * 16];
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X16);
(void)bd;
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd) {
int32_t txfm_buf[4 * 4];
@@ -195,8 +266,10 @@ TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(int tx_type, int tx_size) {
set_flip_cfg(tx_type, &cfg);
int tx_type_col = vtx_tab[tx_type];
int tx_type_row = htx_tab[tx_type];
cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size];
cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size];
int tx_size_col = txsize_vert_map[tx_size];
int tx_size_row = txsize_horz_map[tx_size];
cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size_col];
cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size_row];
return cfg;
}
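The Sqrt2 multiplies above presumably come from normalization: an N x 2N 2D transform has an overall gain of sqrt(N * 2N) = N * sqrt(2), and the shift[] stages can only divide by powers of two, so the leftover sqrt(2) is applied as a Q14 multiply along the larger dimension. A self-contained sketch of that multiply, assuming DCT_CONST_BITS == 14 as in aom_dsp/txfm_common.h:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t tran_high_t;
#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
static const tran_high_t Sqrt2 = 23170;  // 16384 * sqrt(2) in Q14

static tran_high_t fdct_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  // Scaling 100 by sqrt(2) in Q14: (100 * 23170 + 8192) >> 14 == 141.
  printf("%" PRId64 "\n", fdct_round_shift(100 * Sqrt2));
  return 0;
}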
@@ -10,6 +10,7 @@
*/
#include "./av1_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/av1_inv_txfm1d.h"
@@ -106,10 +107,10 @@ TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(int tx_type, int tx_size) {
set_flip_cfg(tx_type, &cfg);
int tx_type_col = vtx_tab[tx_type];
int tx_type_row = htx_tab[tx_type];
// TODO(sarahparker) this is currently only implemented for
// square transforms
cfg.col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size];
cfg.row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size];
int tx_size_col = txsize_vert_map[tx_size];
int tx_size_row = txsize_horz_map[tx_size];
cfg.col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size_col];
cfg.row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size_row];
return cfg;
}
@@ -129,9 +130,17 @@ TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(int tx_type) {
static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
int stride, TXFM_2D_FLIP_CFG *cfg,
int32_t *txfm_buf) {
// TODO(sarahparker) must correct for rectangular transforms in follow up
const int txfm_size = cfg->row_cfg->txfm_size;
const int8_t *shift = cfg->row_cfg->shift;
// Note that when assigning txfm_size_col, we use the txfm_size from the
// row configuration and vice versa. This is intentionally done to
// accurately perform rectangular transforms. When the transform is
// rectangular, the number of columns will be the same as the
// txfm_size stored in the row cfg struct. It will make no difference
// for square transforms.
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
// Take the shift from the larger dimension in the rectangular case.
const int8_t *shift =
txfm_size_col > txfm_size_row ? cfg->row_cfg->shift : cfg->col_cfg->shift;
const int8_t *stage_range_col = cfg->col_cfg->stage_range;
const int8_t *stage_range_row = cfg->row_cfg->stage_range;
const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
@@ -139,39 +148,45 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->col_cfg->txfm_type);
const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->row_cfg->txfm_type);
// txfm_buf's length is txfm_size * txfm_size + 2 * txfm_size
// txfm_buf's length is txfm_size_row * txfm_size_col + 2 * txfm_size_row
// it is used for intermediate data buffering
int32_t *temp_in = txfm_buf;
int32_t *temp_out = temp_in + txfm_size;
int32_t *buf = temp_out + txfm_size;
int32_t *temp_out = temp_in + txfm_size_row;
int32_t *buf = temp_out + txfm_size_row;
int32_t *buf_ptr = buf;
int c, r;
// Rows
for (r = 0; r < txfm_size; ++r) {
for (r = 0; r < txfm_size_row; ++r) {
txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
round_shift_array(buf_ptr, txfm_size, -shift[0]);
input += txfm_size;
buf_ptr += txfm_size;
round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
// Multiply everything by Sqrt2 if the transform is rectangular
if (txfm_size_row != txfm_size_col) {
for (c = 0; c < txfm_size_col; ++c)
buf_ptr[c] = (int32_t)dct_const_round_shift(buf_ptr[c] * Sqrt2);
}
input += txfm_size_col;
buf_ptr += txfm_size_col;
}
// Columns
for (c = 0; c < txfm_size; ++c) {
for (c = 0; c < txfm_size_col; ++c) {
if (cfg->lr_flip == 0) {
for (r = 0; r < txfm_size; ++r) temp_in[r] = buf[r * txfm_size + c];
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + c];
} else {
// flip left right
for (r = 0; r < txfm_size; ++r)
temp_in[r] = buf[r * txfm_size + (txfm_size - c - 1)];
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
round_shift_array(temp_out, txfm_size, -shift[1]);
round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->ud_flip == 0) {
for (r = 0; r < txfm_size; ++r) output[r * stride + c] += temp_out[r];
for (r = 0; r < txfm_size_row; ++r) output[r * stride + c] += temp_out[r];
} else {
// flip upside down
for (r = 0; r < txfm_size; ++r)
output[r * stride + c] += temp_out[txfm_size - r - 1];
for (r = 0; r < txfm_size_row; ++r)
output[r * stride + c] += temp_out[txfm_size_row - r - 1];
}
}
}
@@ -185,11 +200,44 @@ static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
// int16_t*
TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, tx_size);
inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
// TODO(sarahparker) just using the cfg_row->txfm_size for now because
// we are assuming this is only used for square transforms. This will
// be adjusted in a follow-up
clamp_block((int16_t *)output, cfg.row_cfg->txfm_size, stride, 0,
(1 << bd) - 1);
clamp_block((int16_t *)output, cfg.col_cfg->txfm_size, cfg.row_cfg->txfm_size,
stride, 0, (1 << bd) - 1);
}
void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[4 * 8 + 8 + 8];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);
}
void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[8 * 4 + 4 + 4];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
}
void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[8 * 16 + 16 + 16];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);
}
void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[16 * 8 + 8 + 8];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
}
void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[16 * 32 + 32 + 32];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);
}
void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
int stride, int tx_type, int bd) {
int txfm_buf[32 * 16 + 16 + 16];
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
}
void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
@@ -225,5 +273,5 @@ void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
// int16_t*
TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_64x64_cfg(tx_type);
inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
clamp_block((int16_t *)output, 64, 64, stride, 0, (1 << bd) - 1);
}
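As a quick consistency check of the buffer-length comment in inv_txfm2d_add_c (txfm_size_row * txfm_size_col + 2 * txfm_size_row), the wrapper declarations above all satisfy it; the macro below is hypothetical, introduced only for this sketch:

// Hypothetical helper restating the rule from the comment above.
#define INV_TXFM_BUF_LEN(rows, cols) ((rows) * (cols) + 2 * (rows))
// TX_4X8 is 4 columns by 8 rows: INV_TXFM_BUF_LEN(8, 4) == 48,
// matching txfm_buf[4 * 8 + 8 + 8].
// TX_32X16 is 32 columns by 16 rows: INV_TXFM_BUF_LEN(16, 32) == 544,
// matching txfm_buf[32 * 16 + 16 + 16].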
@@ -255,6 +255,12 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
#inv txfm
add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
@@ -405,12 +411,21 @@ add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, in
if (aom_config("CONFIG_DPCM_INTRA") eq "yes") {
@sizes = (4, 8, 16, 32);
foreach $size (@sizes) {
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto "void", "av1_hbd_dpcm_ft$size", "const int16_t *input, int stride, TX_TYPE_1D tx_type, tran_low_t *output, int dir";
}
add_proto "void", "av1_dpcm_ft$size", "const int16_t *input, int stride, TX_TYPE_1D tx_type, tran_low_t *output";
}
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
#fwd txfm
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
@@ -484,35 +499,6 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
}
# fdct functions
add_proto qw/void av1_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_highbd_fht4x4 sse4_1/;
add_proto qw/void av1_highbd_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht4x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht16x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht8x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht32x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
if (aom_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
}
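A hedged usage sketch for one of the newly exposed rectangular entry points (the wrapper function and values here are illustrative only; the prototype matches the add_proto line above):

#include <stdint.h>
#include "./av1_rtcd.h"  // generated from the prototypes above

// Forward-transform an 8-row x 4-column residual block (TX_4X8).
void fwd_4x8_example(const int16_t *residual, int stride) {
  int32_t coeff[4 * 8];
  av1_fwd_txfm2d_4x8_c(residual, coeff, stride,
                       /*tx_type=*/0 /* DCT_DCT */, /*bd=*/10);
}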
@@ -120,11 +120,12 @@ static INLINE int get_max_bit(int x) {
}
// TODO(angiebird): implement SSE
static INLINE void clamp_block(int16_t *block, int block_size, int stride,
int low, int high) {
static INLINE void clamp_block(int16_t *block, int block_size_row,
int block_size_col, int stride, int low,
int high) {
int i, j;
for (i = 0; i < block_size; ++i) {
for (j = 0; j < block_size; ++j) {
for (i = 0; i < block_size_row; ++i) {
for (j = 0; j < block_size_col; ++j) {
block[i * stride + j] = clamp(block[i * stride + j], low, high);
}
}
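A self-contained sketch of the reworked clamp_block semantics (the function is re-implemented locally here for illustration; in-tree it lives in av1_txfm.h):

#include <stdint.h>
#include <stdio.h>

static int16_t clamp16(int32_t v, int low, int high) {
  return (int16_t)(v < low ? low : (v > high ? high : v));
}

// Mirrors the new signature: clamp a block_size_row x block_size_col
// region of a strided buffer to [low, high].
static void clamp_block(int16_t *block, int block_size_row,
                        int block_size_col, int stride, int low, int high) {
  for (int i = 0; i < block_size_row; ++i)
    for (int j = 0; j < block_size_col; ++j)
      block[i * stride + j] = clamp16(block[i * stride + j], low, high);
}

int main(void) {
  const int bd = 10, stride = 4;
  int16_t buf[8 * 4] = { -5, 1024, 2000 };  // remaining entries are zero
  // TX_4X8 output is 8 rows x 4 columns; clamp to [0, 1023] for 10-bit.
  clamp_block(buf, /*block_size_row=*/8, /*block_size_col=*/4, stride, 0,
              (1 << bd) - 1);
  printf("%d %d %d\n", buf[0], buf[1], buf[2]);  // prints: 0 1023 1023
  return 0;
}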
@@ -98,16 +98,20 @@ typedef void (*dpcm_inv_txfm_add_func)(const tran_low_t *input, int stride,
dpcm_inv_txfm_add_func av1_get_dpcm_inv_txfm_add_func(int tx_length);
#if CONFIG_HIGHBITDEPTH
void av1_hbd_dpcm_inv_txfm_add_4_c(const tran_low_t *input, int stride,
TX_TYPE_1D tx_type, int bd, uint16_t *dest);
TX_TYPE_1D tx_type, int bd, uint16_t *dest,
int dir);
void av1_hbd_dpcm_inv_txfm_add_8_c(const tran_low_t *input, int stride,
TX_TYPE_1D tx_type, int bd, uint16_t *dest);
TX_TYPE_1D tx_type, int bd, uint16_t *dest,
int dir);
void av1_hbd_dpcm_inv_txfm_add_16_c(const tran_low_t *input, int stride,
TX_TYPE_1D tx_type, int bd, uint16_t *dest);
TX_TYPE_1D tx_type, int bd, uint16_t *dest,
int dir);
void av1_hbd_dpcm_inv_txfm_add_32_c(const tran_low_t *input, int stride,
TX_TYPE_1D tx_type, int bd, uint16_t *dest);
TX_TYPE_1D tx_type, int bd, uint16_t *dest,
int dir);
typedef void (*hbd_dpcm_inv_txfm_add_func)(const tran_low_t *input, int stride,
TX_TYPE_1D tx_type, int bd,
uint16_t *dest);
uint16_t *dest, int dir);
hbd_dpcm_inv_txfm_add_func av1_get_hbd_dpcm_inv_txfm_add_func(int tx_length);
#endif // CONFIG_HIGHBITDEPTH
#endif // CONFIG_DPCM_INTRA
@@ -40,7 +40,11 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
const int stride,
const TXFM_2D_FLIP_CFG *cfg,
int32_t *txfm_buf) {
// TODO(sarahparker) must correct for rectangular transforms in follow up
// TODO(sarahparker) This does not currently support rectangular transforms
// and will break without splitting txfm_size out into row and col size.
// Rectangular transforms use C code only, so it should be OK for now.
// It will be corrected when there are SSE implementations for rectangular
// transforms.
const int txfm_size = cfg->row_cfg->txfm_size;
const int8_t *shift = cfg->row_cfg->shift;
const int8_t *stage_range_col = cfg->col_cfg->stage_range;
@@ -64,7 +64,7 @@ static INLINE void transpose_32_4x4(int stride, const __m128i *input,
// the entire input block can be represented by a grid of 4x4 blocks
// each 4x4 block can be represented by 4 vertical __m128i
// we first transpose each 4x4 block internally
// than transpose the grid
// then transpose the grid
static INLINE void transpose_32(int txfm_size, const __m128i *input,
__m128i *output) {
const int num_per_128 = 4;
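A scalar sketch of the tiled transpose described in that comment (hypothetical helper, no intrinsics): transposing each 4x4 tile internally while routing tile (bi, bj) to tile (bj, bi) composes to the full transpose.

#include <stdint.h>

// n must be a multiple of 4; out receives the transpose of in.
static void transpose_tiled(const int32_t *in, int32_t *out, int n) {
  for (int bi = 0; bi < n; bi += 4)
    for (int bj = 0; bj < n; bj += 4)
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
          out[(bj + j) * n + (bi + i)] = in[(bi + i) * n + (bj + j)];
}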
@@ -562,7 +562,7 @@ static void hbd_process_block_dpcm_vert(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_width);
for (int r = 0; r < tx1d_height; ++r) {
if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0]));
inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst);
inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst, 1);
dqcoeff += tx1d_width;
dst += dst_stride;
}
@@ -590,7 +590,7 @@ static void hbd_process_block_dpcm_horz(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1];
tx_buff[r] = dqcoeff[r * tx1d_width];
}
inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst);
inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst, 0);
}
}
#endif // CONFIG_HIGHBITDEPTH
@@ -1990,75 +1990,10 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
}
#if CONFIG_HIGHBITDEPTH
void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht4x4_c(input, output, stride, tx_type);
}
void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht4x8_c(input, output, stride, tx_type);
}
void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht8x4_c(input, output, stride, tx_type);
}
void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht8x16_c(input, output, stride, tx_type);
}
void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht16x8_c(input, output, stride, tx_type);
}
void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht16x32_c(input, output, stride, tx_type);
}
void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht32x16_c(input, output, stride, tx_type);
}
void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht4x16_c(input, output, stride, tx_type);
}
void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht16x4_c(input, output, stride, tx_type);
}
void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht8x32_c(input, output, stride, tx_type);
}
void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht32x8_c(input, output, stride, tx_type);
}
void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht8x8_c(input, output, stride, tx_type);
}
void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
av1_fwht4x4_c(input, output, stride);
}
void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht16x16_c(input, output, stride, tx_type);