Commit 03a97b76 authored by Linfeng Zhang, committed by Angie Chiang

Implement av1_lowbd_fwd_txfm2d_{4x16,16x4}_sse2

Change-Id: Ie290cd3512a3cc20b56815048e3470bd9d78abb5
parent 5fdd1caa
......@@ -183,10 +183,11 @@ static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
}
}
static INLINE void store_buffer_16bit_to_32bit_8x8(const __m128i *const in,
int32_t *const out,
const int stride) {
for (int i = 0; i < 8; ++i) {
// Writes `out_size` vectors from `in` to `out`, one per output row.
//
// Row r is handed to store_16bit_to_32bit() with destination
// out + r * stride, so `stride` is the row pitch of `out` in int32_t
// elements. Presumably each call widens eight 16-bit lanes to 32 bits
// (per the helper's name) -- confirm against store_16bit_to_32bit().
static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  int row = 0;
  while (row < out_size) {
    // Compute the destination from the base each time so that no pointer
    // past the last written row is ever formed.
    int32_t *const dst = out + row * stride;
    store_16bit_to_32bit(in[row], dst);
    ++row;
  }
}
......@@ -244,6 +245,9 @@ void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
......@@ -256,6 +260,9 @@ void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd);
......
......@@ -2162,6 +2162,48 @@ void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
}
// SSE2 forward 2D transform for TX_4X16 (width 4, height 16).
//
// Steps, in execution order:
//   1. Load `height` rows from `input` with the w4 loader (the flipping
//      variant when get_flip_cfg() reports ud_flip) and round-shift by
//      shift[0].
//   2. Run the column transform taken from txfm16_arr[tx_type] and
//      round-shift by shift[1].
//   3. Transpose the 16 vectors as two independent 4x8 groups.
//   4. For each group: reverse it with flip_buf_sse2() when lr_flip is set,
//      run the row transform taken from txfm4x8_arr[tx_type], round-shift by
//      shift[2], transpose 8x4 in place and store 8 rows to `output`.
//
// input   - source samples; `stride` is forwarded to the loader.
// output  - destination; group g is written starting at
//           output + 8 * width * g with a row pitch of `width`.
// tx_type - selects both 1D transforms and the flip configuration.
// bd      - unused.
void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const int width = 4;
  const int height = 16;
  __m128i col_buf[16], row_buf[16];

  const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
  const int w_idx = get_txw_idx(TX_4X16);
  const int h_idx = get_txh_idx(TX_4X16);
  const int col_cos_bit = fwd_cos_bit_col[w_idx][h_idx];
  const int row_cos_bit = fwd_cos_bit_row[w_idx][h_idx];
  const transform_1d_sse2 transform_cols = txfm16_arr[tx_type].col;
  const transform_1d_sse2 transform_rows = txfm4x8_arr[tx_type].row;

  int flip_ud, flip_lr;
  get_flip_cfg(tx_type, &flip_ud, &flip_lr);

  // Column pass over all 16 rows at once.
  if (!flip_ud) {
    load_buffer_16bit_to_16bit_w4(input, stride, col_buf, height);
  } else {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, col_buf, height);
  }
  round_shift_16bit(col_buf, height, shift[0]);
  transform_cols(col_buf, col_buf, col_cos_bit);
  round_shift_16bit(col_buf, height, shift[1]);

  // After these two transposes every column result lives in row_buf, so
  // col_buf is free to be reused as flip scratch below.
  transpose_16bit_4x8(col_buf, row_buf);
  transpose_16bit_4x8(col_buf + 8, row_buf + 8);

  // Row pass, one group of 8 vectors at a time.
  for (int half = 0; half < 2; ++half) {
    __m128i *rows = row_buf + 8 * half;
    if (flip_lr) {
      flip_buf_sse2(rows, col_buf, width);
      rows = col_buf;
    }
    transform_rows(rows, rows, row_cos_bit);
    round_shift_16bit(rows, width, shift[2]);
    transpose_16bit_8x4(rows, rows);
    store_buffer_16bit_to_32bit_w4(rows, output + 8 * width * half, width, 8);
  }
}
void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
......@@ -2233,7 +2275,7 @@ void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, width, shift[2]);
transpose_16bit_8x8(buf, buf);
store_buffer_16bit_to_32bit_8x8(buf, output, width);
store_buffer_16bit_to_32bit_w8(buf, output, width, height);
}
void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
......@@ -2318,10 +2360,53 @@ void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, width, shift[2]);
transpose_16bit_8x8(buf, buf);
store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
}
}
// SSE2 forward 2D transform for TX_16X4 (width 16, height 4).
//
// Steps, in execution order:
//   1. For each 8-column half of the input (input + 8 * half): load `height`
//      rows (the flipping loader when get_flip_cfg() reports ud_flip),
//      round-shift by shift[0], run the column transform taken from
//      txfm8x4_arr[tx_type], round-shift by shift[1] and transpose 8x4 into
//      the matching 8-vector slot of the row buffer.
//   2. Reverse the 16 vectors with flip_buf_sse2() when lr_flip is set.
//   3. Run the row transform taken from txfm16_arr[tx_type] and round-shift
//      by shift[2].
//   4. For each group of 8 vectors: transpose 4x8 in place and store
//      `height` rows to `output`.
//
// input   - source samples; `stride` is forwarded to the loader.
// output  - destination; group g is written starting at output + 8 * g with
//           a row pitch of `width`.
// tx_type - selects both 1D transforms and the flip configuration.
// bd      - unused.
void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const int width = 16;
  const int height = 4;
  __m128i col_buf[16], row_buf[16];

  const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
  const int w_idx = get_txw_idx(TX_16X4);
  const int h_idx = get_txh_idx(TX_16X4);
  const int col_cos_bit = fwd_cos_bit_col[w_idx][h_idx];
  const int row_cos_bit = fwd_cos_bit_row[w_idx][h_idx];
  const transform_1d_sse2 transform_cols = txfm8x4_arr[tx_type].col;
  const transform_1d_sse2 transform_rows = txfm16_arr[tx_type].row;

  int flip_ud, flip_lr;
  get_flip_cfg(tx_type, &flip_ud, &flip_lr);

  // Column pass: col_buf is scratch for one 8-column half at a time; the
  // transposed result of each half lands in its own slot of row_buf.
  for (int half = 0; half < 2; ++half) {
    const int16_t *const src = input + 8 * half;
    if (!flip_ud) {
      load_buffer_16bit_to_16bit(src, stride, col_buf, height);
    } else {
      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, height);
    }
    round_shift_16bit(col_buf, height, shift[0]);
    transform_cols(col_buf, col_buf, col_cos_bit);
    round_shift_16bit(col_buf, height, shift[1]);
    transpose_16bit_8x4(col_buf, row_buf + 8 * half);
  }

  // Row pass over all 16 vectors; col_buf is no longer needed for column
  // data, so it receives the flipped copy when one is required.
  __m128i *rows = row_buf;
  if (flip_lr) {
    flip_buf_sse2(row_buf, col_buf, width);
    rows = col_buf;
  }
  transform_rows(rows, rows, row_cos_bit);
  round_shift_16bit(rows, width, shift[2]);

  // Transpose and store each group of 8 vectors, first group first.
  for (int half = 0; half < 2; ++half) {
    __m128i *const group = rows + 8 * half;
    transpose_16bit_4x8(group, group);
    store_buffer_16bit_to_32bit_w8(group, output + 8 * half, width, height);
  }
}
void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
......@@ -2405,9 +2490,10 @@ void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, width, shift[2]);
transpose_16bit_8x8(buf, buf);
store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
transpose_16bit_8x8(buf + 8, buf + 8);
store_buffer_16bit_to_32bit_8x8(buf + 8, output + 8 * width * i + 8, width);
store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
8);
}
}
......@@ -2507,16 +2593,17 @@ void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, width, shift[2]);
transpose_16bit_8x8(buf, buf);
store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
height);
transpose_16bit_8x8(buf + 8, buf + 8);
store_buffer_16bit_to_32bit_8x8(buf + 8, output + 8 * width * i + 8,
width);
store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
height);
transpose_16bit_8x8(buf + 16, buf + 16);
store_buffer_16bit_to_32bit_8x8(buf + 16, output + 8 * width * i + 16,
width);
store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
width, height);
transpose_16bit_8x8(buf + 24, buf + 24);
store_buffer_16bit_to_32bit_8x8(buf + 24, output + 8 * width * i + 24,
width);
store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
width, height);
}
} else {
av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
......@@ -2626,16 +2713,16 @@ void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, width, shift[2]);
transpose_16bit_8x8(buf, buf);
store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
transpose_16bit_8x8(buf + 8, buf + 8);
store_buffer_16bit_to_32bit_8x8(buf + 8, output + 8 * width * i + 8,
width);
store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
8);
transpose_16bit_8x8(buf + 16, buf + 16);
store_buffer_16bit_to_32bit_8x8(buf + 16, output + 8 * width * i + 16,
width);
store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
width, 8);
transpose_16bit_8x8(buf + 24, buf + 24);
store_buffer_16bit_to_32bit_8x8(buf + 24, output + 8 * width * i + 24,
width);
store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
width, 8);
}
} else {
av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
......@@ -2659,8 +2746,8 @@ FwdTxfm2dFuncSSE2 fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
NULL, // 32x64 transform
NULL, // 64x32 transform
NULL, // 4x16 transform
NULL, // 16x4 transform
av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
NULL, // 16x64 transform
......
......@@ -218,8 +218,8 @@ FwdTxfm2dFunc fwd_func_sse2_list[TX_SIZES_ALL][2] = {
{ av1_fwd_txfm2d_32x16_c, av1_lowbd_fwd_txfm2d_32x16_sse2 }, // TX_32X16
{ NULL, NULL }, // TX_32X64
{ NULL, NULL }, // TX_64X32
{ NULL, NULL }, // TX_4X16
{ NULL, NULL }, // TX_16X4
{ av1_fwd_txfm2d_4x16_c, av1_lowbd_fwd_txfm2d_4x16_sse2 }, // TX_4X16
{ av1_fwd_txfm2d_16x4_c, av1_lowbd_fwd_txfm2d_16x4_sse2 }, // TX_16X4
{ av1_fwd_txfm2d_8x32_c, av1_lowbd_fwd_txfm2d_8x32_sse2 }, // TX_8X32
{ av1_fwd_txfm2d_32x8_c, av1_lowbd_fwd_txfm2d_32x8_sse2 }, // TX_32X8
{ NULL, NULL }, // TX_16X64
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment