Commit aab6aee3 authored by Imdad Sardharwalla, committed by Debargha Mukherjee

AVX2 implementation of the Wiener filter

Added an AVX2 version of the Wiener filter, along with associated tests. Speed
tests have been added for all implementations of the Wiener filter.

Speed Test results
==================

GCC
---

Low bit-depth filter:
- SSE2 vs C: SSE2 takes ~92% less time
- AVX2 vs C: AVX2 takes ~96% less time
- SSE2 vs AVX2: AVX2 takes ~43% less time (~74% faster)

High bit-depth filter:
- SSSE3 vs C: SSSE3 takes ~92% less time
- AVX2  vs C: AVX2  takes ~96% less time
- SSSE3 vs AVX2: AVX2 takes ~46% less time (~84% faster)

CLANG
-----

Low bit-depth filter:
- SSE2 vs C: SSE2 takes ~84% less time
- AVX2 vs C: AVX2 takes ~88% less time
- SSE2 vs AVX2: AVX2 takes ~27% less time (~36% faster)

High bit-depth filter:
- SSSE3 vs C: SSSE3 takes ~85% less time
- AVX2  vs C: AVX2  takes ~89% less time
- SSSE3 vs AVX2: AVX2 takes ~24% less time (~31% faster)
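
(Note on the two figures: a filter that takes r% less time runs in (100-r)% of
the original time, so it is 100/(100-r) times as fast; e.g. ~43% less time
means ~1.74x, i.e. ~74% faster.)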

Change-Id: Ide22d7c09c0be61483e9682caf17a39438e4a208
parent f7d1ff49
@@ -440,9 +440,17 @@ if (CONFIG_LOOP_RESTORATION)
${AOM_DSP_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
set(AOM_DSP_COMMON_INTRIN_SSSE3
${AOM_DSP_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_avx2.c")
set(AOM_DSP_COMMON_INTRIN_SSSE3
${AOM_DSP_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c")
endif ()
set(AOM_DSP_ENCODER_INTRIN_SSE4_1
@@ -381,6 +381,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
specialize qw/aom_convolve8_add_src_horiz ssse3/;
specialize qw/aom_convolve8_add_src_vert ssse3/;
specialize qw/aom_convolve8_add_src_hip sse2/;
specialize qw/aom_convolve8_add_src_hip avx2/;
} # CONFIG_LOOP_RESTORATION
# TODO(any): These need to be extended to up to 128x128 block sizes
@@ -424,7 +425,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
specialize qw/aom_highbd_convolve8_add_src_hip ssse3/;
specialize qw/aom_highbd_convolve8_add_src_hip ssse3 avx2/;
} # CONFIG_LOOP_RESTORATION
#
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
void aom_convolve8_add_src_hip_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
const int bd = 8;
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
(void)x_step_q4;
(void)y_step_q4;
DECLARE_ALIGNED(32, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
int intermediate_height = h + SUBPEL_TAPS - 1;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
const __m128i zero_128 = _mm_setzero_si128();
const __m256i zero_256 = _mm256_setzero_si256();
// Add an offset to account for the "add_src" part of the convolve function.
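// Placing (1 << FILTER_BITS) at word index 3 adds it to the center tap of the
// 8-tap kernel (see center_tap below), so the filtered output effectively
// includes the source pixel itself.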
const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
const __m256i clamp_low = zero_256;
const __m256i clamp_high = _mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);
/* Horizontal filter */
{
// coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
// coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
// coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const =
_mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
(1 << (bd + FILTER_BITS - 1)));
for (int i = 0; i < intermediate_height; ++i) {
for (int j = 0; j < w; j += 16) {
const uint8_t *data_ij = src_ptr + i * src_stride + j;
// Load 8-bit src data
const __m128i data_0 = xx_loadu_128(data_ij + 0);
const __m128i data_1 = xx_loadu_128(data_ij + 1);
const __m128i data_2 = xx_loadu_128(data_ij + 2);
const __m128i data_3 = xx_loadu_128(data_ij + 3);
const __m128i data_4 = xx_loadu_128(data_ij + 4);
const __m128i data_5 = xx_loadu_128(data_ij + 5);
const __m128i data_6 = xx_loadu_128(data_ij + 6);
const __m128i data_7 = xx_loadu_128(data_ij + 7);
// (Zero-)Extend 8-bit data to 16-bit data
const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
// Multiply src data by filter coeffs and sum pairs
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
// Calculate scalar product for even- and odd-indices separately,
// increasing to 32-bit precision
const __m256i res_even_sum = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
const __m256i res_odd_sum = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
const __m256i res_even =
_mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
FILTER_BITS - EXTRAPREC_BITS);
const __m256i res_odd =
_mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
FILTER_BITS - EXTRAPREC_BITS);
// Reduce to 16-bit precision and pack even- and odd-index results
// back into one register. The _mm256_packs_epi32 intrinsic returns
// a register with the pixels ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i res = _mm256_packs_epi32(res_even, res_odd);
const __m256i res_clamped =
_mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
// Store in a temporary array
yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
}
}
}
/* Vertical filter */
{
// coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
// coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
// coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
// coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const =
_mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
(1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 16) {
const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
// Load 16-bit data from the output of the horizontal filter in
// which the pixels are ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
// Filter the even-indices, increasing to 32-bit precision
const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_even = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
// Filter the odd-indices, increasing to 32-bit precision
const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
const __m256i res_odd = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
// Pixels are currently in the following order:
// res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
// res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
//
// Rearrange the pixels into the following order:
// res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
// res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
const __m256i res_lo_round =
_mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
FILTER_BITS + EXTRAPREC_BITS);
const __m256i res_hi_round =
_mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
FILTER_BITS + EXTRAPREC_BITS);
// Reduce to 16-bit precision and pack into the correct order:
// [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
const __m256i res_16bit =
_mm256_packs_epi32(res_lo_round, res_hi_round);
// Reduce to 8-bit precision. This messes up the order:
// [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
// [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
const __m256i res_8bit =
_mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
// Swap the two central 64-bit values to get the order:
// [ - - - - - - - - - - - - - - - - ]
// [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
// Store the lower 128-bit lane in the dst array
xx_storeu_128(dst + i * dst_stride + j,
_mm256_castsi256_si128(res_8bit2));
}
}
}
}
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#if EXTRAPREC_BITS > 2
#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
#endif
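// (The vertical pass feeds the clamped horizontal-filter output directly into
// _mm256_madd_epi16, so those intermediates must fit in signed 16-bit lanes,
// which the clamp to EXTRAPREC_CLAMP_LIMIT(bd) - 1 only guarantees when
// EXTRAPREC_BITS <= 2.)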
// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
// on the left.
// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
void aom_highbd_convolve8_add_src_hip_avx2(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
(void)x_step_q4;
(void)y_step_q4;
const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
DECLARE_ALIGNED(32, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
int intermediate_height = h + SUBPEL_TAPS - 1;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
const __m128i zero_128 = _mm_setzero_si128();
const __m256i zero_256 = _mm256_setzero_si256();
// Add an offset to account for the "add_src" part of the convolve function.
const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
const __m256i clamp_low = zero_256;
/* Horizontal filter */
{
const __m256i clamp_high_ep =
_mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);
// coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
// coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
// coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const =
_mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
(1 << (bd + FILTER_BITS - 1)));
for (int i = 0; i < intermediate_height; ++i) {
for (int j = 0; j < w; j += 16) {
const uint16_t *src_ij = src_ptr + i * src_stride + j;
// Load 16-bit src data
const __m256i src_0 = yy_loadu_256(src_ij + 0);
const __m256i src_1 = yy_loadu_256(src_ij + 1);
const __m256i src_2 = yy_loadu_256(src_ij + 2);
const __m256i src_3 = yy_loadu_256(src_ij + 3);
const __m256i src_4 = yy_loadu_256(src_ij + 4);
const __m256i src_5 = yy_loadu_256(src_ij + 5);
const __m256i src_6 = yy_loadu_256(src_ij + 6);
const __m256i src_7 = yy_loadu_256(src_ij + 7);
// Multiply src data by filter coeffs and sum pairs
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
// Calculate scalar product for even- and odd-indices separately,
// increasing to 32-bit precision
const __m256i res_even_sum = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
const __m256i res_even =
_mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
FILTER_BITS - EXTRAPREC_BITS);
const __m256i res_odd_sum = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
const __m256i res_odd =
_mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
FILTER_BITS - EXTRAPREC_BITS);
// Reduce to 16-bit precision and pack even- and odd-index results
// back into one register. The _mm256_packs_epi32 intrinsic returns
// a register with the pixels ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i res = _mm256_packs_epi32(res_even, res_odd);
const __m256i res_clamped =
_mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
// Store in a temporary array
yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
}
}
}
/* Vertical filter */
{
const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
// coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
// coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
// coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const =
_mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
(1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 16) {
const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
// Load 16-bit data from the output of the horizontal filter in
// which the pixels are ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
// Filter the even-indices, increasing to 32-bit precision
const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_even = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
// Filter the odd-indices, increasing to 32-bit precision
const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
const __m256i res_odd = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
// Pixels are currently in the following order:
// res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
// res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
//
// Rearrange the pixels into the following order:
// res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
// res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
const __m256i res_lo_round =
_mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
FILTER_BITS + EXTRAPREC_BITS);
const __m256i res_hi_round =
_mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
FILTER_BITS + EXTRAPREC_BITS);
// Reduce to 16-bit precision and pack into the correct order:
// [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
const __m256i res_16bit =
_mm256_packs_epi32(res_lo_round, res_hi_round);
const __m256i res_16bit_clamped = _mm256_min_epi16(
_mm256_max_epi16(res_16bit, clamp_low), clamp_high);
// Store in the dst array
yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
}
}
}
}
@@ -42,4 +42,11 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) {
_mm256_storeu_si256((__m256i *)a, v);
}
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
// therefore define an equivalent function using a different intrinsic.
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
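// Example use: the hip convolve kernels broadcast a 128-bit register of
// filter coefficients into both lanes, e.g.
// yy_set_m128i(coeffs_01_128, coeffs_01_128).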
static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
@@ -20,23 +20,40 @@ using std::tr1::tuple;
namespace {
#if HAVE_SSE2
#if HAVE_SSE2 || HAVE_AVX2
TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
RunSpeedTest(GET_PARAM(3));
}
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
libaom_test::AV1HiprecConvolve::BuildParams(
aom_convolve8_add_src_hip_sse2));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HiprecConvolveTest,
libaom_test::AV1HiprecConvolve::BuildParams(
aom_convolve8_add_src_hip_avx2));
#endif
#endif
#if HAVE_SSSE3
#if HAVE_SSSE3 || HAVE_AVX2
TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
RunCheckOutput(GET_PARAM(4));
}
TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) {
RunSpeedTest(GET_PARAM(4));
}
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdHiprecConvolveTest,
libaom_test::AV1HighbdHiprecConvolve::BuildParams(
aom_highbd_convolve8_add_src_hip_ssse3));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdHiprecConvolveTest,
libaom_test::AV1HighbdHiprecConvolve::BuildParams(
aom_highbd_convolve8_add_src_hip_avx2));
#endif
#endif
} // namespace
@@ -74,10 +74,10 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
uint8_t *input_ = new uint8_t[h * w];
uint8_t *input = input_;
// The convolve functions always write rows with widths that are multiples of
// 8.
// So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
int output_n = ((out_w + 7) & ~7) * out_h;
// The AVX2 convolve functions always write rows with widths that are
// multiples of 16. So to avoid a buffer overflow, we may need to pad
// rows to a multiple of 16.
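// (ALIGN_POWER_OF_TWO(out_w, 4) rounds out_w up to the next multiple of
// 2^4 = 16.)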
int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
uint8_t *output = new uint8_t[output_n];
uint8_t *output2 = new uint8_t[output_n];
@@ -108,6 +108,70 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
delete[] output;
delete[] output2;
}
void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
const int w = 128, h = 128;
const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
const int num_iters = GET_PARAM(2) / 500;
int i, j, k;
uint8_t *input_ = new uint8_t[h * w];
uint8_t *input = input_;
// The AVX2 convolve functions always write rows with widths that are
// multiples of 16. So to avoid a buffer overflow, we may need to pad
// rows to a multiple of 16.
int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
uint8_t *output = new uint8_t[output_n];
uint8_t *output2 = new uint8_t[output_n];
// Generate random filter kernels
DECLARE_ALIGNED(16, InterpKernel, hkernel);
DECLARE_ALIGNED(16, InterpKernel, vkernel);
generate_kernels(&rnd_, hkernel, vkernel);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
aom_usec_timer ref_timer;