Commit d8a423c6 authored by David Barker

Add SSSE3 warp filter + const-ify warp filters

The SSSE3 filter is very similar to the SSE2 filter, but the
horizontal pass is sped up by using the 8-bit x 8-bit -> 16-bit
multiplies added in SSSE3.
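
For illustration, a minimal standalone sketch of that instruction-level
difference (made-up pixel/coefficient values, not the actual kernel):
SSE2 has to widen 8-bit pixels to 16 bits before _mm_madd_epi16, whereas
SSSE3's _mm_maddubs_epi16 multiplies unsigned 8-bit pixels by signed
8-bit coefficients and sums adjacent products in one instruction.

  #include <emmintrin.h>   // SSE2 intrinsics
  #include <stdint.h>
  #include <stdio.h>
  #include <tmmintrin.h>   // SSSE3 intrinsics (_mm_maddubs_epi16)

  int main(void) {
    const uint8_t pixels[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
                                 9, 10, 11, 12, 13, 14, 15, 16 };
    const int8_t coeffs[16] = { 1, -1, 1, -1, 1, -1, 1, -1,
                                1, -1, 1, -1, 1, -1, 1, -1 };
    const __m128i px = _mm_loadu_si128((const __m128i *)pixels);
    const __m128i co = _mm_loadu_si128((const __m128i *)coeffs);

    // SSE2: widen u8 pixels and s8 coefficients to 16 bits, then use the
    // 16x16->32 multiply-and-add.
    const __m128i zero = _mm_setzero_si128();
    const __m128i px16 = _mm_unpacklo_epi8(px, zero);
    const __m128i co16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, co), 8);
    const __m128i sse2_res = _mm_madd_epi16(px16, co16);   // 4 x int32 sums

    // SSSE3: one instruction does u8 x s8 -> s16 with pairwise add.
    const __m128i ssse3_res = _mm_maddubs_epi16(px, co);   // 8 x int16 sums

    int32_t out32[4];
    int16_t out16[8];
    _mm_storeu_si128((__m128i *)out32, sse2_res);
    _mm_storeu_si128((__m128i *)out16, ssse3_res);
    printf("SSE2 pair sums:  %d %d %d %d\n",
           out32[0], out32[1], out32[2], out32[3]);
    printf("SSSE3 pair sums: %d %d %d %d %d %d %d %d\n",
           out16[0], out16[1], out16[2], out16[3],
           out16[4], out16[5], out16[6], out16[7]);
    return 0;
  }

Both paths compute the same pairwise pixel*coefficient sums; the SSSE3
path simply skips the widening step, which is where the horizontal-pass
saving comes from.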

Also apply const-correctness to all versions of the filter.

The timings of the existing filters are unchanged, and the
lowbd SSSE3 filter is ~17% faster than the lowbd SSE2 filter.

Timings per 8x8 block:
lowbd SSE2: 320ns
lowbd SSSE3: 273ns
highbd SSSE3: 300ns
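(lowbd SSE2 -> SSSE3: (320 - 273) / 273 ~= 17% faster)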

Filter output is unchanged.

Change-Id: Ifb428a33b106d900cde1b080794796c0754ae182
parent 0757fd8f
@@ -151,8 +151,6 @@ set(AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/variance_tree.h")
set(AOM_AV1_COMMON_INTRIN_SSE2
# Requires CONFIG_GLOBAL_MOTION or CONFIG_WARPED_MOTION
#"${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c"
"${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
set(AOM_AV1_COMMON_INTRIN_SSSE3
@@ -382,7 +380,7 @@ if (CONFIG_PVQ)
endif ()
endif ()
if (CONFIG_WARPED_MOTION)
if (CONFIG_WARPED_MOTION OR CONFIG_GLOBAL_MOTION)
set(AOM_AV1_COMMON_SOURCES
${AOM_AV1_COMMON_SOURCES}
"${AOM_ROOT}/av1/common/warped_motion.c"
@@ -391,6 +389,16 @@ if (CONFIG_WARPED_MOTION)
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
set(AOM_AV1_COMMON_SSSE3_INTRIN
${AOM_AV1_COMMON_SSSE3_INTRIN}
"${AOM_ROOT}/av1/common/x86/warp_plane_ssse3.c")
if (CONFIG_HIGHBITDEPTH)
set(AOM_AV1_COMMON_SSSE3_INTRIN
${AOM_AV1_COMMON_SSSE3_INTRIN}
"${AOM_ROOT}/av1/common/x86/highbd_warp_plane_ssse3.c")
endif ()
endif ()
# Setup AV1 common/decoder/encoder targets. The libaom target must exist before
@@ -172,6 +172,7 @@ endif
ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/warp_plane_ssse3.c
ifeq ($(CONFIG_HIGHBITDEPTH),yes)
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_warp_plane_ssse3.c
endif
@@ -608,11 +608,11 @@ if (aom_config("CONFIG_PVQ") eq "yes") {
if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
(aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
add_proto qw/void av1_warp_affine/, "int32_t *mat, uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse2/;
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse2 ssse3/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_warp_affine/, "int32_t *mat, uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_highbd_warp_affine ssse3/;
}
}
@@ -949,9 +949,9 @@ static void highbd_warp_plane_old(WarpedMotionParams *wm, uint8_t *ref8,
//
// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
// intermediate array.
void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
int height, int stride, uint16_t *pred, int p_col,
int p_row, int p_width, int p_height,
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
int width, int height, int stride, uint16_t *pred,
int p_col, int p_row, int p_width, int p_height,
int p_stride, int subsampling_x,
int subsampling_y, int bd, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
@@ -1046,7 +1046,7 @@ void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
int sy = sy4 + gamma * (-4) + delta * k;
for (l = -4; l < 4; ++l) {
for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
@@ -1199,9 +1199,9 @@ static void warp_plane_old(WarpedMotionParams *wm, uint8_t *ref, int width,
TODO(david.barker): Maybe support scaled references?
*/
void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
@@ -14,16 +14,14 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
static const __m128i *const filter = (const __m128i *const)warped_filter;
/* SSE2 version of the rotzoom/affine warp filter */
void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
int height, int stride, uint16_t *pred,
int p_col, int p_row, int p_width,
int p_height, int p_stride, int subsampling_x,
int subsampling_y, int bd, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
/* SSSE3 version of the rotzoom/affine warp filter */
void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
int width, int height, int stride,
uint16_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int bd,
int ref_frm, int16_t alpha, int16_t beta,
int16_t gamma, int16_t delta) {
#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
#else
@@ -47,8 +45,8 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
for (j = 0; j < p_width; j += 8) {
// (x, y) coordinates of the center of this block in the destination
// image
int32_t dst_x = p_col + j + 4;
int32_t dst_y = p_row + i + 4;
const int32_t dst_x = p_col + j + 4;
const int32_t dst_y = p_row + i + 4;
int32_t x4, y4, ix4, sx4, iy4, sy4;
if (subsampling_x)
@@ -92,55 +90,59 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
const int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load source pixels
__m128i src =
const __m128i src =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
__m128i src2 =
const __m128i src2 =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
// Filter even-index pixels
__m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
// coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
// coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
__m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
// coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
__m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
__m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
__m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
__m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
__m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
__m128i round_const =
const __m128i round_const =
_mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
// Calculate filtered results
__m128i res_0 = _mm_madd_epi16(src, coeff_0);
__m128i res_2 =
const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
const __m128i res_2 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
__m128i res_4 =
const __m128i res_4 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
__m128i res_6 =
const __m128i res_6 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
@@ -149,32 +151,36 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
__m128i tmp_1 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_3 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_5 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_7 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
__m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
__m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
__m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
__m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
__m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
__m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
__m128i res_1 =
const __m128i tmp_1 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_3 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_5 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_7 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
const __m128i res_1 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
__m128i res_3 =
const __m128i res_3 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
__m128i res_5 =
const __m128i res_5 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
__m128i res_7 =
const __m128i res_7 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
@@ -191,100 +197,108 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
int sy = sy4 + gamma * (-4) + delta * k +
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
const int sy = sy4 + gamma * (-4) + delta * k +
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
__m128i *src = tmp + (k + 4);
__m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
__m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
__m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
__m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
const __m128i *src = tmp + (k + 4);
const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
// Filter even-index pixels
__m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
__m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
__m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
__m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
__m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
__m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
__m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
__m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
__m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
__m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
__m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
_mm_add_epi32(res_4, res_6));
const __m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
_mm_add_epi32(res_4, res_6));
// Filter odd-index pixels
__m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
__m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
__m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
__m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
__m128i tmp_1 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_3 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_5 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_7 = _mm_loadu_si128(
(__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
__m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
__m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
__m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
__m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
__m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
__m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
__m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
__m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
__m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
__m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
_mm_add_epi32(res_5, res_7));
const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
const __m128i tmp_1 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_3 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_5 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_7 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
_mm_add_epi32(res_5, res_7));
// Rearrange pixels back into the order 0 ... 7
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
// Round and pack into 8 bits
__m128i round_const =
const __m128i round_const =
_mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
__m128i res_lo_round = _mm_srai_epi32(
const __m128i res_lo_round = _mm_srai_epi32(
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_hi_round = _mm_srai_epi32(
const __m128i res_hi_round = _mm_srai_epi32(
_mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
__m128i max_val = _mm_set1_epi16((1 << bd) - 1);
__m128i zero = _mm_setzero_si128();
const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
const __m128i zero = _mm_setzero_si128();
res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
// Store, blending with 'pred' if needed
__m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
__m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
// Note: If we're outputting a 4x4 block, we need to be very careful
// to only output 4 pixels at this point, to avoid encode/decode
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -14,12 +14,10 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
static const __m128i *const filter = (const __m128i *const)warped_filter;
/* SSE2 version of the rotzoom/affine warp filter */
void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
@@ -42,8 +40,8 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
for (j = 0; j < p_width; j += 8) {
// (x, y) coordinates of the center of this block in the destination
// image
int32_t dst_x = p_col + j + 4;
int32_t dst_y = p_row + i + 4;
const int32_t dst_x = p_col + j + 4;
const int32_t dst_y = p_row + i + 4;
int32_t x4, y4, ix4, sx4, iy4, sy4;
if (subsampling_x)
@@ -87,56 +85,60 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
const int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load source pixels
__m128i zero = _mm_setzero_si128();
__m128i src =
const __m128i zero = _mm_setzero_si128();
const __m128i src =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
// Filter even-index pixels
__m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_2 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_4 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_6 = _mm_loadu_si128(
(__m128i *)(warped_filter +
((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
// coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
// coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
__m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
// coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
__m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
__m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
__m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
__m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
__m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
__m128i round_const =
const __m128i round_const =
_mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
// Calculate filtered results
__m128i src_0 = _mm_unpacklo_epi8(src, zero);
__m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
__m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
__m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
__m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
__m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
__m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
__m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
_mm_add_epi32(res_2, res_6));
@@ -144,33 +146,37 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
__m128i tmp_1 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_3 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_5 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_7 = _mm_loadu_si128(
(__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);