Commit 1fc3df55 authored by Debargha Mukherjee

Stop using VP9 convolve scheme in AV1 encoder.

Discontinue all VP9-style convolve rounding operations in the non-normative
parts of the encoder.

The C function av1_convolve_2d_sr_c is forced in place of the SIMD versions
of the same function, because the SIMD code is incompatible with round_1 > 0.
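
A self-contained illustration of the kind of mismatch involved (not the codec
code; the 7 + 4 bit split is arbitrary): rounding a filter sum in two stages is
not, in general, the same as rounding once by the combined shift, so a SIMD
kernel that assumes round_1 == 0 cannot be expected to bit-match a path that
applies a separate round_1 shift.

    #include <stdint.h>
    #include <stdio.h>

    /* Local stand-in for aom's ROUND_POWER_OF_TWO macro. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))

    int main(void) {
      const int round_1 = 7, remaining = 4; /* hypothetical split of an 11-bit shift */
      int mismatches = 0;
      for (int32_t v = 0; v < (1 << 12); ++v) {
        const int32_t two_stage =
            ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(v, round_1), remaining);
        const int32_t one_stage = ROUND_POWER_OF_TWO(v, round_1 + remaining);
        if (two_stage != one_stage) ++mismatches; /* e.g. v = 960: 1 vs 0 */
      }
      printf("%d of 4096 inputs round differently\n", mismatches);
      return 0;
    }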

In the -DCONFIG_LOWPRECISION_BLEND=2 -DCONFIG_HIGHPRECISION_INTBUF=1
setting, results on 15 frames of lowres (cpu-used=1) are 0.019% better (-0.019%).

Change-Id: I72154bd896357c352c944fb2cd3b25bafafba46a
parent aa5904ba
......@@ -453,19 +453,16 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
specialize qw/av1_warp_affine sse4_1/;
}
specialize qw/av1_warp_affine sse4_1/;
} else {
specialize qw/av1_warp_affine sse2 ssse3/;
}
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
specialize qw/av1_highbd_warp_affine sse4_1/;
}
specialize qw/av1_highbd_warp_affine sse4_1/;
} else {
specialize qw/av1_highbd_warp_affine ssse3/;
}
......
......@@ -1159,9 +1159,6 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
int bd) {
(void)dst;
(void)dst_stride;
InterpFilterParams filter_params_x, filter_params_y;
#if CONFIG_SHORT_FILTER
av1_get_convolve_filter_params(interp_filters, &filter_params_x,
......@@ -1172,71 +1169,101 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
#endif
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
if (filter_params_y.taps < filter_params_x.taps) {
uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
(MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
int tr_dst_stride = MAX_SB_SIZE;
int fo_vert = filter_params_y.taps / 2 - 1;
int fo_horiz = filter_params_x.taps / 2 - 1;
transpose_uint16(
tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz,
src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
conv_params->dst_stride, w, h);
// horizontal and vertical parameters are swapped because of the transpose
if (conv_params->dst) {
if (filter_params_y.taps < filter_params_x.taps) {
uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
(MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
int tr_dst_stride = MAX_SB_SIZE;
int fo_vert = filter_params_y.taps / 2 - 1;
int fo_horiz = filter_params_x.taps / 2 - 1;
transpose_uint16(tr_src, tr_src_stride,
src - fo_vert * src_stride - fo_horiz, src_stride,
w + filter_params_x.taps - 1,
h + filter_params_y.taps - 1);
transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
conv_params->dst_stride, w, h);
// horizontal and vertical parameters are swapped because of the transpose
#if CONFIG_JNT_COMP
if (scaled)
av1_highbd_convolve_2d_scale(
tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
else
av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x,
subpel_y_q4, subpel_x_q4, conv_params, bd);
if (scaled)
av1_highbd_convolve_2d_scale(
tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
else
av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x,
subpel_y_q4, subpel_x_q4, conv_params, bd);
#else
if (scaled)
av1_highbd_convolve_2d_scale(
tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
else
av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
subpel_x_q4, conv_params, bd);
if (scaled)
av1_highbd_convolve_2d_scale(
tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
else
av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
subpel_x_q4, conv_params, bd);
#endif // CONFIG_JNT_COMP
transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
tr_dst_stride, h, w);
} else {
#if CONFIG_JNT_COMP
if (scaled)
av1_highbd_convolve_2d_scale(
src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
&filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params, bd);
else
av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h,
&filter_params_x, &filter_params_y,
subpel_x_q4, subpel_y_q4, conv_params, bd);
#else
if (scaled)
av1_highbd_convolve_2d_scale(
src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
&filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params, bd);
else
av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params, bd);
#endif // CONFIG_JNT_COMP
transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
tr_dst_stride, h, w);
}
} else {
CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
int tmp_dst_stride = MAX_SB_SIZE;
#if CONFIG_JNT_COMP
if (scaled)
av1_highbd_convolve_2d_scale(
src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
&filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params, bd);
av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
h, &filter_params_x, &filter_params_y,
subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, conv_params, bd);
else
av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h,
av1_highbd_jnt_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
&filter_params_x, &filter_params_y,
subpel_x_q4, subpel_y_q4, conv_params, bd);
#else
if (scaled)
av1_highbd_convolve_2d_scale(
src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
&filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params, bd);
av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
h, &filter_params_x, &filter_params_y,
subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, conv_params, bd);
else
av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params, bd);
av1_highbd_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
&filter_params_x, &filter_params_y, subpel_x_q4,
subpel_y_q4, conv_params, bd);
#endif // CONFIG_JNT_COMP
// 0-bit rounding just to convert from int32 to uint16
av1_highbd_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_stride, w, h,
0, bd);
}
}
......
......@@ -60,27 +60,6 @@ typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
int bd) {
ConvolveParams conv_params;
conv_params.ref = ref;
conv_params.do_average = do_average;
conv_params.round = CONVOLVE_OPT_ROUND;
conv_params.plane = plane;
conv_params.do_post_rounding = 0;
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = 0;
conv_params.is_compound = 0;
conv_params.dst = NULL;
conv_params.dst_stride = 0;
const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
if (bd < 12) assert(intbufrange <= 16);
if (intbufrange > 16) {
conv_params.round_0 += intbufrange - 16;
}
return conv_params;
}
static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
InterpFilterParams *params_x,
InterpFilterParams *params_y
......@@ -107,6 +86,7 @@ static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
struct AV1Common;
struct scale_factors;
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilters interp_filters, const int subpel_x_q4,
......@@ -114,6 +94,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int scaled, ConvolveParams *conv_params,
const struct scale_factors *sf);
static INLINE ConvolveParams get_conv_params_round(int ref, int do_average,
int plane, int bd) {
ConvolveParams conv_params;
conv_params.ref = ref;
conv_params.do_average = do_average;
conv_params.plane = plane;
conv_params.round = CONVOLVE_OPT_ROUND;
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = 0;
conv_params.do_post_rounding = 0;
conv_params.is_compound = 0;
conv_params.dst = NULL;
conv_params.dst_stride = 0;
const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
if (bd < 12) assert(intbufrange <= 16);
if (intbufrange > 16) {
conv_params.round_0 += intbufrange - 16;
}
return conv_params;
}
static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
int plane, int32_t *dst,
int dst_stride,
......@@ -125,7 +126,8 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
conv_params.is_compound = is_compound;
conv_params.round_0 = ROUND0_BITS;
#if CONFIG_LOWPRECISION_BLEND
conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS : 0;
conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
: 2 * FILTER_BITS - conv_params.round_0;
#else
conv_params.round_1 = 0;
#endif
......@@ -145,6 +147,11 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
return conv_params;
}
static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
int bd) {
return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
}
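
A hedged usage sketch of the redefined helper (the header path
av1/common/convolve.h is assumed; everything else comes from the hunks above):
with this patch get_conv_params() simply returns no-round parameters with a
NULL dst, so kernels that need the 32-bit intermediate buffer must test both
the mode and the buffer, as the warp hunks below now do.

    #include "av1/common/convolve.h" /* assumed header location */

    static void predict_plane_sketch(int plane, int bd) {
      /* Single-prediction callers now also get CONVOLVE_OPT_NO_ROUND, but with
         dst == NULL; under CONFIG_LOWPRECISION_BLEND the non-compound round_1
         becomes 2 * FILTER_BITS - round_0. */
      ConvolveParams p = get_conv_params(/*ref=*/0, /*do_average=*/0, plane, bd);
      if (p.round == CONVOLVE_OPT_NO_ROUND && p.dst) {
        /* compound path: accumulate into p.dst at round_0/round_1 precision */
      } else {
        /* single-reference path: the kernel produces pixels directly */
      }
    }
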
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
InterpFilters interp_filters,
......
......@@ -58,7 +58,7 @@ typedef struct mv32 {
#define WARP_PARAM_REDUCE_BITS 6
// Precision bits reduction after horizontal shear
#define HORSHEAR_REDUCE_PREC_BITS 5
#define HORSHEAR_REDUCE_PREC_BITS 3
#define VERSHEAR_REDUCE_PREC_BITS \
(2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
......
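
With HORSHEAR_REDUCE_PREC_BITS lowered from 5 to 3, the horizontal-shear
intermediate no longer fits in 16 bits at 12-bit depth, which is why the warp
kernels further down bump reduce_bits_horiz at run time. A small stand-alone
sketch of that bit budget (WARPEDPIXEL_FILTER_BITS = 7 is assumed; the
adjustment formula is the one added in av1_highbd_warp_affine_c below):

    #include <stdio.h>

    #define WARPEDPIXEL_FILTER_BITS 7   /* assumed value */
    #define HORSHEAR_REDUCE_PREC_BITS 3 /* new value in this patch */

    int main(void) {
      for (int bd = 8; bd <= 12; bd += 2) {
        int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
        /* The horizontal intermediate needs
           bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz bits;
           keep that within 16-bit range. */
        if (bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
          reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
        printf("bd=%2d: reduce_bits_horiz=%d\n", bd, reduce_bits_horiz);
      }
      return 0; /* prints 3, 3, 5 for bd = 8, 10, 12 */
    }
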
......@@ -185,7 +185,7 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
// subpel_y_q4 == 0
sf->convolve[1][0][0] = av1_convolve_x_sr;
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->convolve[1][1][0] = av1_convolve_2d_sr;
sf->convolve[1][1][0] = av1_convolve_2d_sr_c;
#if CONFIG_JNT_COMP
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
......
......@@ -422,19 +422,24 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int32_t tmp[15 * 8];
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
if (!use_conv_params &&
bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
const int reduce_bits_vert =
use_conv_params ? conv_params->round_1
: 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int max_bits_horiz =
use_conv_params
? bd + FILTER_BITS + 1 - conv_params->round_0
: bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
use_conv_params ? bd + FILTER_BITS + 1 - conv_params->round_0
: bd + WARPEDPIXEL_FILTER_BITS + 1 - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
const int offset_bits_vert =
use_conv_params
? bd + 2 * FILTER_BITS - conv_params->round_0
: bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
use_conv_params ? bd + 2 * FILTER_BITS - conv_params->round_0
: bd + 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
if (use_conv_params) {
conv_params->do_post_rounding = 1;
}
......@@ -534,7 +539,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
} else {
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
assert(0 <= sum && sum < (1 << (bd + 2)));
uint16_t px =
clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
......@@ -719,9 +724,13 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
int16_t gamma, int16_t delta) {
int32_t tmp[15 * 8];
const int bd = 8;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int reduce_bits_vert =
use_conv_params ? conv_params->round_1
: 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int max_bits_horiz =
use_conv_params
? bd + FILTER_BITS + 1 - conv_params->round_0
......@@ -837,7 +846,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
} else {
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
assert(0 <= sum && sum < (1 << (bd + 2)));
uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
if (conv_params->do_average)
......
......@@ -140,7 +140,7 @@ static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
int bits) {
const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
if (w > 64) { // width = 128
......@@ -283,7 +283,7 @@ void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
uint8_t *dst8, int dst_stride, int w,
int h, int bits, int bd) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
if (w > 64) { // width = 128
......
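
The constant change above, from (1 << (bits - 1)) to ((1 << bits) >> 1),
mirrors the 0-bit av1_highbd_convolve_rounding call added in the facade: the
rounding kernels must now also accept bits == 0, where the old expression would
shift by -1. A minimal scalar sketch (round_shift is a hypothetical helper, not
the library function):

    #include <stdint.h>
    #include <stdio.h>

    /* (1 << bits) >> 1 is 0 when bits == 0, so the "rounding" degenerates to a
       plain copy, which is what the int32-to-uint16 conversion path relies on;
       1 << (bits - 1) would be a shift by -1 there (undefined behaviour). */
    static int32_t round_shift(int32_t v, int bits) {
      const int32_t rnd = (1 << bits) >> 1;
      return (v + rnd) >> bits;
    }

    int main(void) {
      printf("%d %d\n", round_shift(37, 0), round_shift(37, 3)); /* 37 5 */
      return 0;
    }
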
......@@ -22,15 +22,18 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int comp_avg = conv_params->do_average;
#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
#else
#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
#endif
int i, j, k;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
if (!use_conv_params &&
bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
const int reduce_bits_vert =
use_conv_params ? conv_params->round_1
: 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
if (use_conv_params) {
......@@ -91,10 +94,9 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -104,10 +106,9 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -361,13 +362,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
} else {
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
_mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
((1 << reduce_bits_vert) >> 1));
const __m128i res_lo_round = _mm_srai_epi32(
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
_mm_add_epi32(res_lo, round_const), reduce_bits_vert);
const __m128i res_hi_round = _mm_srai_epi32(
_mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
_mm_add_epi32(res_hi, round_const), reduce_bits_vert);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
......
......@@ -22,21 +22,25 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int comp_avg = conv_params->do_average;
#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
#else
#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
#endif
int i, j, k;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
if (!use_conv_params &&
bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
const int reduce_bits_vert =
use_conv_params ? conv_params->round_1
: 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
if (use_conv_params) {
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
if (bd == 12 && reduce_bits_horiz < 5) printf("Error\n");
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
......@@ -85,10 +89,9 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -98,10 +101,9 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -320,13 +322,13 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
} else {
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
_mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
((1 << reduce_bits_vert) >> 1));
const __m128i res_lo_round = _mm_srai_epi32(
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
_mm_add_epi32(res_lo, round_const), reduce_bits_vert);
const __m128i res_hi_round = _mm_srai_epi32(
_mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
_mm_add_epi32(res_hi, round_const), reduce_bits_vert);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
......
......@@ -24,7 +24,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
__m128i tmp[15];
int i, j, k;
const int bd = 8;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
......@@ -81,10 +82,9 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -94,10 +94,9 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......
......@@ -212,7 +212,8 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
__m128i tmp[15];
int i, j, k;
const int bd = 8;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
......@@ -275,10 +276,9 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......@@ -288,10 +288,9 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
......
......@@ -211,7 +211,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
__m128i tmp[15];
int i, j, k;
const int bd = 8;
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int use_conv_params =
(conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
......@@ -268,10 +269,9 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
1)) +
(1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
(1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));