Commit ef34fff7 authored by Cheng Chen, committed by Jingning Han

JNT_COMP: add SIMD implementations for c functions

Add SIMD implementations of the C functions for low bit-depth, making
the encoder 3~4x faster than with the C functions.

Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52
parent f78632e0
......@@ -59,7 +59,8 @@ set(AOM_DSP_COMMON_ASM_SSSE3
set(AOM_DSP_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_SSE4_1
"${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
......
......@@ -107,6 +107,7 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
DSP_SRCS-$(HAVE_SSSE3) += x86/variance_ssse3.c
# interpolation filters
DSP_SRCS-yes += aom_convolve.c
......
......@@ -1110,6 +1110,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
......@@ -1339,6 +1340,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
specialize qw/aom_jnt_comp_avg_pred ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
......
......@@ -214,7 +214,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
\
aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
\
return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
}
#else // CONFIG_JNT_COMP
#define SUBPIX_AVG_VAR(W, H) \
......@@ -397,13 +397,11 @@ void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
double sum = bck_offset + fwd_offset;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
tmp = (int)(0.5 + tmp / sum);
if (tmp > 255) tmp = 255;
tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;
......@@ -420,7 +418,6 @@ void aom_jnt_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
double sum = bck_offset + fwd_offset;
aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
ref_stride);
......@@ -428,8 +425,7 @@ void aom_jnt_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
tmp = (int)(0.5 + tmp / sum);
if (tmp > 255) tmp = 255;
tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
#include "./av1_rtcd.h"
#if CONFIG_JNT_COMP
/* Computes a weighted average of 16 byte pairs and stores the 16 results.
 *
 * For every byte position k the stored value is
 *   (p0[k] * w_even + p1[k] * w_odd + r) >> DIST_PRECISION_BITS
 * packed back to an unsigned byte with saturation, where w_even / w_odd are
 * the even / odd bytes of *w and r is the 16-bit lane value of *r.
 *
 * p0, p1  : the two vectors of 16 unsigned pixels to blend.
 * w       : interleaved byte weights; _mm_maddubs_epi16 reads them as
 *           signed 8-bit values.
 * r       : rounding term added to every 16-bit sum before the shift.
 * result  : destination for the 16 output bytes (written via xx_storeu_128).
 */
static void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w,
                                 const __m128i *r, void *const result) {
  const __m128i first = *p0;
  const __m128i second = *p1;
  const __m128i weights = *w;
  const __m128i rounding = *r;

  // Interleave the inputs so that each 16-bit lane holds one (p0, p1) pair.
  const __m128i pairs_lo = _mm_unpacklo_epi8(first, second);
  const __m128i pairs_hi = _mm_unpackhi_epi8(first, second);

  // Multiply each pair by its weights, sum within the lane, then add rounding.
  const __m128i sum_lo =
      _mm_add_epi16(_mm_maddubs_epi16(pairs_lo, weights), rounding);
  const __m128i sum_hi =
      _mm_add_epi16(_mm_maddubs_epi16(pairs_hi, weights), rounding);

  // Scale back down to pixel range.
  const __m128i avg_lo = _mm_srai_epi16(sum_lo, DIST_PRECISION_BITS);
  const __m128i avg_hi = _mm_srai_epi16(sum_hi, DIST_PRECISION_BITS);

  // Narrow the sixteen 16-bit results to bytes and write them out.
  xx_storeu_128(result, _mm_packus_epi16(avg_lo, avg_hi));
}
/* SSSE3 implementation of aom_jnt_comp_avg_pred.
 *
 * Writes to comp_pred the weighted average of ref and pred:
 *   (ref * fwd_offset + pred * bck_offset + round) >> DIST_PRECISION_BITS
 * with round = (1 << DIST_PRECISION_BITS) >> 1.
 *
 * comp_pred  : output block; rows are written back to back (stride == width).
 * pred       : second predictor; rows are read back to back (stride == width).
 * width      : block width; must be 4, 8, or a multiple of 16.
 * height     : block height; must be even when width is 8 and a multiple of
 *              4 when width is 4.
 * ref        : first predictor, read with a row pitch of ref_stride.
 * ref_stride : distance in bytes between consecutive rows of ref.
 * jcp_param  : supplies fwd_offset (weight for ref) and bck_offset (weight
 *              for pred). Both are truncated to 8 bits and consumed as signed
 *              bytes by _mm_maddubs_epi16, so they must fit in [0, 127].
 */
void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height, const uint8_t *ref,
                                 int ref_stride,
                                 const JNT_COMP_PARAMS *jcp_param) {
  int i;
  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
  // Byte layout (low to high): w0, w1, w0, w1, ... so that after interleaving
  // ref with pred, ref bytes meet w0 and pred bytes meet w1.
  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
                                 w1, w0, w1, w0);
  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
  const __m128i r = _mm_set1_epi16(round);

  if (width >= 16) {
    // Read 16 pixels of one row at a time.
    assert(!(width & 15));
    for (i = 0; i < height; ++i) {
      int j;
      for (j = 0; j < width; j += 16) {
        __m128i p0 = xx_loadu_128(ref);
        __m128i p1 = xx_loadu_128(pred);

        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);

        comp_pred += 16;
        pred += 16;
        ref += 16;
      }
      // ref has already advanced by width; skip the rest of the pitch.
      ref += ref_stride - width;
    }
  } else if (width >= 8) {
    // Read 8 pixels from each of two rows at a time.
    assert(!(width & 7));
    // Two rows are consumed per iteration, so the height must be even.
    assert(!(height & 1));
    for (i = 0; i < height; i += 2) {
      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
      // pred rows are contiguous, so one 16-byte load covers both rows.
      __m128i p1 = xx_loadu_128(pred);

      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);

      comp_pred += 16;
      pred += 16;
      ref += 2 * ref_stride;
    }
  } else {
    // Read 4 pixels from each of four rows at a time.
    assert(!(width & 3));
    assert(!(height & 3));
    for (i = 0; i < height; i += 4) {
      __m128i p0_0 = xx_loadl_32(ref + 0 * ref_stride);
      __m128i p0_1 = xx_loadl_32(ref + 1 * ref_stride);
      __m128i p0_2 = xx_loadl_32(ref + 2 * ref_stride);
      __m128i p0_3 = xx_loadl_32(ref + 3 * ref_stride);
      // Gather the four 4-pixel rows into one 16-byte vector, in row order.
      __m128i p0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(p0_0, p0_1),
                                      _mm_unpacklo_epi32(p0_2, p0_3));
      // pred rows are contiguous, so one 16-byte load covers all four rows.
      __m128i p1 = xx_loadu_128(pred);

      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);

      comp_pred += 16;
      pred += 16;
      ref += 4 * ref_stride;
    }
  }
}
/* SSSE3 implementation of aom_jnt_comp_avg_upsampled_pred.
 *
 * First fills comp_pred by calling aom_upsampled_pred on ref, then blends
 * that result in place with pred:
 *   (comp_pred * fwd_offset + pred * bck_offset + round) >> DIST_PRECISION_BITS
 * with round = (1 << DIST_PRECISION_BITS) >> 1.
 *
 * comp_pred and pred are walked as flat arrays of width * height bytes, 16
 * bytes per step, so width * height must be a multiple of 16.
 */
void aom_jnt_comp_avg_upsampled_pred_ssse3(uint8_t *comp_pred,
                                           const uint8_t *pred, int width,
                                           int height, int subpel_x_q3,
                                           int subpel_y_q3, const uint8_t *ref,
                                           int ref_stride,
                                           const JNT_COMP_PARAMS *jcp_param) {
  int blocks_left;

  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);

  /* The total number of pixels must be a multiple of 16 (e.g., 4x4). */
  assert((width * height & 15) == 0);
  blocks_left = width * height >> 4;

  const uint8_t fwd_weight = (uint8_t)jcp_param->fwd_offset;
  const uint8_t bck_weight = (uint8_t)jcp_param->bck_offset;
  // Byte layout (low to high): fwd, bck, fwd, bck, ... to line up with the
  // (comp_pred, pred) interleave done inside compute_jnt_comp_avg.
  const __m128i weights = _mm_set_epi8(
      bck_weight, fwd_weight, bck_weight, fwd_weight, bck_weight, fwd_weight,
      bck_weight, fwd_weight, bck_weight, fwd_weight, bck_weight, fwd_weight,
      bck_weight, fwd_weight, bck_weight, fwd_weight);
  const uint16_t half = ((1 << DIST_PRECISION_BITS) >> 1);
  const __m128i rounding = _mm_set1_epi16(half);

  while (blocks_left > 0) {
    __m128i upsampled = xx_loadu_128(comp_pred);
    __m128i second = xx_loadu_128(pred);

    // Result overwrites the 16 bytes of comp_pred that were just loaded.
    compute_jnt_comp_avg(&upsampled, &second, &weights, &rounding, comp_pred);

    comp_pred += 16;
    pred += 16;
    --blocks_left;
  }
}
#endif // CONFIG_JNT_COMP
......@@ -288,6 +288,9 @@ if (CONFIG_CONVOLVE_ROUND)
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
set(AOM_AV1_COMMON_INTRIN_SSE4_1
${AOM_AV1_COMMON_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c")
if (CONFIG_HIGHBITDEPTH)
set(AOM_AV1_COMMON_INTRIN_SSSE3
${AOM_AV1_COMMON_INTRIN_SSSE3}
......
......@@ -155,6 +155,7 @@ endif
ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c
ifeq ($(CONFIG_HIGHBITDEPTH),yes)
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
endif
......
......@@ -557,6 +557,11 @@ if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
specialize qw/av1_convolve_2d_scale sse4_1/;
}
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_jnt_convolve_2d sse4_1/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
specialize qw/av1_highbd_convolve_2d ssse3/;
......
......@@ -417,7 +417,54 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int x, y, k;
uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (y = 0; y < im_h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = 0;
for (k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
im_block[y * im_stride + x] =
clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
}
}
// vertical filter
uint8_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 0;
for (k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->bck_offset == -1) {
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
......@@ -432,15 +479,10 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
}
}
#else
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#endif // CONFIG_JNT_COMP
}
}
}
#endif // CONFIG_JNT_COMP
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
......@@ -571,7 +613,60 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int x, y, k;
int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bd = 8;
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (y = 0; y < im_h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
im_block[y * im_stride + x] =
ROUND_POWER_OF_TWO(sum, conv_params->round_0);
}
}
// vertical filter
int32_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 1 << offset_bits;
for (k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
if (conv_params->fwd_offset == -1) {
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
......@@ -586,15 +681,10 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
dst[y * dst_stride + x] = res * conv_params->fwd_offset;
}
}
#else
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#endif // CONFIG_JNT_COMP
}
}
}
#endif // CONFIG_JNT_COMP
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
......@@ -716,15 +806,15 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal and vertical parameters are swapped because of the transpose
#if CONFIG_JNT_COMP
if (scaled)
av1_convolve_2d_scale_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
y_step_q4, subpel_x_q4, x_step_q4, conv_params);
av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
y_step_q4, subpel_x_q4, x_step_q4, conv_params);
else
av1_convolve_2d_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
subpel_x_q4, conv_params);
av1_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
tr_src_stride, tr_dst, tr_dst_stride, h, w,
&filter_params_y, &filter_params_x, subpel_y_q4,
subpel_x_q4, conv_params);
#else
if (scaled)
av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
......@@ -742,15 +832,15 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
} else {
#if CONFIG_JNT_COMP
if (scaled)
av1_convolve_2d_scale_c(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params);
av1_convolve_2d_scale(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params);
else
av1_convolve_2d_c(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params);
av1_jnt_convolve_2d(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params);
#else
if (scaled)
av1_convolve_2d_scale(src, src_stride, conv_params->dst,
......
......@@ -260,6 +260,12 @@ static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
(1 << (offset_bits - conv_params->round_1 - 1)));
const __m128i sub = _mm_set1_epi32(sub32);
#if CONFIG_JNT_COMP
const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
#endif // CONFIG_JNT_COMP
int y_qn = subpel_y_qn;
for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
......@@ -305,10 +311,29 @@ static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
const __m128i subbed = _mm_sub_epi32(shifted, sub);
int32_t *dst_x = dst + y * dst_stride + x;
#if CONFIG_JNT_COMP
__m128i result;
if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
if (conv_params->do_average) {
result = _mm_srai_epi32(
_mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
_mm_mullo_epi32(subbed, bck_offset)),
jnt_round),
DIST_PRECISION_BITS - 1);
} else {
result = _mm_mullo_epi32(subbed, fwd_offset);
}
} else {
result = (conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
}
#else
const __m128i result =
(conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
#endif // CONFIG_JNT_COMP
_mm_storeu_si128((__m128i *)dst_x, result);
}
......@@ -317,10 +342,24 @@ static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#if CONFIG_JNT_COMP
if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
if (conv_params->do_average) {
dst[y * dst_stride + x] += res * conv_params->bck_offset;
dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
} else {
dst[y * dst_stride + x] = res * conv_params->fwd_offset;
}
} else {
#endif // CONFIG_JNT_COMP
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#if CONFIG_JNT_COMP
}
#endif // CONFIG_JNT_COMP
}
}
}
......@@ -342,6 +381,12 @@ static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
(1 << (offset_bits - conv_params->round_1 - 1)));
const __m128i sub = _mm_set1_epi32(sub32);
#if CONFIG_JNT_COMP
const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
#endif // CONFIG_JNT_COMP
int y_qn = subpel_y_qn;
for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
......@@ -384,10 +429,29 @@ static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
const __m128i subbed = _mm_sub_epi32(shifted, sub);
int32_t *dst_x = dst + y * dst_stride + x;
#if CONFIG_JNT_COMP
__m128i result;
if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
if (conv_params->do_average) {
result = _mm_srai_epi32(
_mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
_mm_mullo_epi32(subbed, bck_offset)),
jnt_round),
DIST_PRECISION_BITS - 1);
} else {
result = _mm_mullo_epi32(subbed, fwd_offset);
}
} else {
result = (conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
}
#else
const __m128i result =
(conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
#endif // CONFIG_JNT_COMP
_mm_storeu_si128((__m128i *)dst_x, result);
}
......@@ -396,10 +460,24 @@ static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#if CONFIG_JNT_COMP
if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
if (conv_params->do_average) {
dst[y * dst_stride + x] += res * conv_params->bck_offset;
dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
} else {
dst[y * dst_stride + x] = res * conv_params->fwd_offset;
}
} else {
#endif // CONFIG_JNT_COMP
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
#if CONFIG_JNT_COMP
}
#endif // CONFIG_JNT_COMP
}
}
}
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <emmintrin.h>
#include <smmintrin.h>
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
#if CONFIG_JNT_COMP
#if CONFIG_COMPOUND_ROUND
void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
DECLARE_ALIGNED(16, uint8_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
int im_h = h + filter_params_y->taps - 1;
int im_stride = MAX_SB_SIZE;