Commit 2478bed5 authored by Yue Chen

Port SIMD optimization for obmc blending functions to av1

Add SIMD optimizations for the 1D blending functions used in OBMC mode,
along with some code refactoring and cleanup.

(ped_1080p25.y4m, 150 frames, 2000 tb)
Encoding time overhead: +18.8% -> +18.1%
Decoding time overhead: +21.3% -> +8.7%
Change-Id: I9d856c32136e7e0e6e24ab5520ef901d7b1ee9c8
parent 863b0499
......@@ -15,6 +15,8 @@ DSP_SRCS-yes += aom_dsp_common.h
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h
# bit reader
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
......@@ -87,6 +89,16 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
# inter predictions
DSP_SRCS-yes += blend.h
DSP_SRCS-yes += blend_a64_mask.c
DSP_SRCS-yes += blend_a64_hmask.c
DSP_SRCS-yes += blend_a64_vmask.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
# interpolation filters
DSP_SRCS-yes += aom_convolve.c
DSP_SRCS-yes += aom_convolve.h
......
......@@ -42,6 +42,22 @@ typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif // CONFIG_AOM_HIGHBITDEPTH
// Logical implication: false only when 'a' holds and 'b' does not.
#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
// True when at most one bit of 'x' is set. Note that this also evaluates to
// true for x == 0, and that 'x' is expanded twice, so it must not have side
// effects.
#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
// ordering by the compiler.
// With __GNUC__ defined they map onto __builtin_expect; otherwise they
// simply yield the wrapped expression.
#ifdef __GNUC__
#define LIKELY(v) __builtin_expect(v, 1)
#define UNLIKELY(v) __builtin_expect(v, 0)
#else
#define LIKELY(v) (v)
#define UNLIKELY(v) (v)
#endif
// Saturates an integer sample value to the 8-bit pixel range [0, 255].
static INLINE uint8_t clip_pixel(int val) {
  if (val > 255) {
    return 255;
  }
  if (val < 0) {
    return 0;
  }
  return val;
}
......
......@@ -961,6 +961,25 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {
} # CONFIG_AV1_ENCODER
} # CONFIG_AOM_QM
#
# Alpha blending with mask
#
# aom_blend_a64_mask takes a 2D mask (pointer plus mask_stride) and two
# subsampling flags; the hmask/vmask variants take only a mask pointer and
# no mask stride.
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
# Each prototype is registered with an sse4_1 variant.
specialize "aom_blend_a64_mask", qw/sse4_1/;
specialize "aom_blend_a64_hmask", qw/sse4_1/;
specialize "aom_blend_a64_vmask", qw/sse4_1/;
# The high bit-depth prototypes carry an extra trailing "int bd" argument and
# are only declared when CONFIG_AOM_HIGHBITDEPTH is "yes".
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
}
if (aom_config("CONFIG_ENCODERS") eq "yes") {
#
# Block subtraction
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BLEND_H_
#define AOM_DSP_BLEND_H_
#include "aom_ports/mem.h"
// Various blending functions and macros.
// See also the aom_blend_* functions in aom_dsp_rtcd.h
//
// NOTE: the alpha argument 'a' appears twice in the expansion of the A64 and
// A256 macros, so it must be free of side effects. ROUND_POWER_OF_TWO is not
// defined in this header; it is presumably provided by aom_ports/mem.h,
// which is included above.

// Alpha blending with alpha values from the range [0, 64], where 64
// means use the first input and 0 means use the second input.
// The weighted sum a * v0 + (64 - a) * v1 is handed to ROUND_POWER_OF_TWO
// together with AOM_BLEND_A64_ROUND_BITS.
#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
#define AOM_BLEND_A64(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A64_ROUND_BITS)
// Alpha blending with alpha values from the range [0, 256], where 256
// means use the first input and 0 means use the second input.
// Same form as AOM_BLEND_A64, with 8 round bits instead of 6.
#define AOM_BLEND_A256_ROUND_BITS 8
#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
#define AOM_BLEND_A256(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A256_ROUND_BITS)
// Blending by averaging.
// The sum v0 + v1 is handed to ROUND_POWER_OF_TWO with a shift of 1.
#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
#endif // AOM_DSP_BLEND_H_
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
// Blends src0 and src1 into dst with a 1D horizontal mask: column j of every
// row is weighted by mask[j], which is applied through AOM_BLEND_A64 (mask
// value weights src0, its complement to 64 weights src1).
//
// dst may alias src0 or src1, provided the aliased source shares dst_stride.
// h and w must be >= 1 and powers of two (checked by assert only).
void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (row = 0; row < h; ++row) {
    // Row offsets stay in uint32_t, the same type as the strides.
    const uint32_t dst_off = row * dst_stride;
    const uint32_t src0_off = row * src0_stride;
    const uint32_t src1_off = row * src1_stride;

    for (col = 0; col < w; ++col) {
      dst[dst_off + col] =
          AOM_BLEND_A64(mask[col], src0[src0_off + col], src1[src1_off + col]);
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth blend of src0 and src1 into dst with a 1D horizontal mask:
// column j of every row is weighted by mask[j] through AOM_BLEND_A64.
//
// dst_8, src0_8 and src1_8 are converted with CONVERT_TO_SHORTPTR and then
// accessed as uint16_t samples; the strides index those uint16_t buffers.
// dst may alias src0 or src1, provided the aliased source shares dst_stride.
// h and w must be >= 1 and powers of two, and bd must be 8, 10 or 12 (all
// checked by assert only).
void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  // bd is only read by the assert below; mark it used so that NDEBUG builds
  // do not emit an unused-parameter warning.
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(
          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/aom_dsp_common.h"
#include "./aom_dsp_rtcd.h"
// Blends src0 and src1 into dst using a 2D alpha mask. Mask values come from
// the range [0, 64], as described for AOM_BLEND_A64 in aom_dsp/blend.h.
// Either source may be the same buffer as dst (with a matching stride), or
// dst may be distinct from both.
//
// subw / subh select how the mask is sampled for output pixel (i, j):
//   subw == 0, subh == 0: mask[i][j] is used directly.
//   subw == 1, subh == 1: rounded average of the 2x2 mask cell at (2i, 2j).
//   subw == 1, subh == 0: rounded average of mask[i][2j] and mask[i][2j + 1].
//   anything else:        rounded average of mask[2i][j] and mask[2i + 1][j].
// h and w must be >= 1 and powers of two (checked by assert only).
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                          const uint8_t *src0, uint32_t src0_stride,
                          const uint8_t *src1, uint32_t src1_stride,
                          const uint8_t *mask, uint32_t mask_stride, int h,
                          int w, int subh, int subw) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (subw == 0 && subh == 0) {
    // Mask is at the same resolution as the block.
    for (row = 0; row < h; ++row) {
      const uint32_t dst_off = row * dst_stride;
      const uint32_t src0_off = row * src0_stride;
      const uint32_t src1_off = row * src1_stride;
      const uint32_t mask_off = row * mask_stride;

      for (col = 0; col < w; ++col) {
        const int alpha = mask[mask_off + col];
        dst[dst_off + col] =
            AOM_BLEND_A64(alpha, src0[src0_off + col], src1[src1_off + col]);
      }
    }
  } else if (subw == 1 && subh == 1) {
    // Mask is subsampled in both directions: average each 2x2 cell.
    for (row = 0; row < h; ++row) {
      const uint32_t dst_off = row * dst_stride;
      const uint32_t src0_off = row * src0_stride;
      const uint32_t src1_off = row * src1_stride;
      const uint32_t mask_top = (2 * row) * mask_stride;
      const uint32_t mask_bot = (2 * row + 1) * mask_stride;

      for (col = 0; col < w; ++col) {
        const int alpha = ROUND_POWER_OF_TWO(
            mask[mask_top + (2 * col)] + mask[mask_bot + (2 * col)] +
                mask[mask_top + (2 * col + 1)] + mask[mask_bot + (2 * col + 1)],
            2);
        dst[dst_off + col] =
            AOM_BLEND_A64(alpha, src0[src0_off + col], src1[src1_off + col]);
      }
    }
  } else if (subw == 1 && subh == 0) {
    // Mask is subsampled horizontally only: average each pair of columns.
    for (row = 0; row < h; ++row) {
      const uint32_t dst_off = row * dst_stride;
      const uint32_t src0_off = row * src0_stride;
      const uint32_t src1_off = row * src1_stride;
      const uint32_t mask_off = row * mask_stride;

      for (col = 0; col < w; ++col) {
        const int alpha = AOM_BLEND_AVG(mask[mask_off + (2 * col)],
                                        mask[mask_off + (2 * col + 1)]);
        dst[dst_off + col] =
            AOM_BLEND_A64(alpha, src0[src0_off + col], src1[src1_off + col]);
      }
    }
  } else {
    // Remaining case: average each vertical pair of mask rows.
    for (row = 0; row < h; ++row) {
      const uint32_t dst_off = row * dst_stride;
      const uint32_t src0_off = row * src0_stride;
      const uint32_t src1_off = row * src1_stride;
      const uint32_t mask_top = (2 * row) * mask_stride;
      const uint32_t mask_bot = (2 * row + 1) * mask_stride;

      for (col = 0; col < w; ++col) {
        const int alpha =
            AOM_BLEND_AVG(mask[mask_top + col], mask[mask_bot + col]);
        dst[dst_off + col] =
            AOM_BLEND_A64(alpha, src0[src0_off + col], src1[src1_off + col]);
      }
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth blend of src0 and src1 into dst using a 2D alpha mask that
// is applied through AOM_BLEND_A64.
//
// dst_8, src0_8 and src1_8 are converted with CONVERT_TO_SHORTPTR and then
// accessed as uint16_t samples; their strides index those uint16_t buffers.
// The mask itself stays 8-bit. dst may alias src0 or src1, provided the
// aliased source shares dst_stride.
//
// subw / subh select how the mask is sampled for output pixel (i, j):
//   subw == 0, subh == 0: mask[i][j] is used directly.
//   subw == 1, subh == 1: rounded average of the 2x2 mask cell at (2i, 2j).
//   subw == 1, subh == 0: rounded average of mask[i][2j] and mask[i][2j + 1].
//   anything else:        rounded average of mask[2i][j] and mask[2i + 1][j].
// h and w must be >= 1 and powers of two, and bd must be 8, 10 or 12 (all
// checked by assert only).
void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                 const uint8_t *src0_8, uint32_t src0_stride,
                                 const uint8_t *src1_8, uint32_t src1_stride,
                                 const uint8_t *mask, uint32_t mask_stride,
                                 int h, int w, int subh, int subw, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  // bd is only read by the assert below; mark it used so that NDEBUG builds
  // do not emit an unused-parameter warning.
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  if (subw == 0 && subh == 0) {
    // Mask is at the same resolution as the block.
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = mask[i * mask_stride + j];
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 1) {
    // Mask is subsampled in both directions: average each 2x2 cell.
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = ROUND_POWER_OF_TWO(
            mask[(2 * i) * mask_stride + (2 * j)] +
                mask[(2 * i + 1) * mask_stride + (2 * j)] +
                mask[(2 * i) * mask_stride + (2 * j + 1)] +
                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
            2);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 0) {
    // Mask is subsampled horizontally only: average each pair of columns.
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                    mask[i * mask_stride + (2 * j + 1)]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else {
    // Remaining case: average each vertical pair of mask rows.
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                    mask[(2 * i + 1) * mask_stride + j]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
// Blends src0 and src1 into dst with a 1D vertical mask: every pixel of row i
// is weighted by mask[i], which is applied through AOM_BLEND_A64 (mask value
// weights src0, its complement to 64 weights src1).
//
// dst may alias src0 or src1, provided the aliased source shares dst_stride.
// h and w must be >= 1 and powers of two (checked by assert only).
void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (row = 0; row < h; ++row) {
    // One alpha value per row; row offsets stay in uint32_t like the strides.
    const int alpha = mask[row];
    const uint32_t dst_off = row * dst_stride;
    const uint32_t src0_off = row * src0_stride;
    const uint32_t src1_off = row * src1_stride;

    for (col = 0; col < w; ++col) {
      dst[dst_off + col] =
          AOM_BLEND_A64(alpha, src0[src0_off + col], src1[src1_off + col]);
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth blend of src0 and src1 into dst with a 1D vertical mask:
// every pixel of row i is weighted by mask[i] through AOM_BLEND_A64.
//
// dst_8, src0_8 and src1_8 are converted with CONVERT_TO_SHORTPTR and then
// accessed as uint16_t samples; the strides index those uint16_t buffers.
// dst may alias src0 or src1, provided the aliased source shares dst_stride.
// h and w must be >= 1 and powers of two, and bd must be 8, 10 or 12 (all
// checked by assert only).
void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  // bd is only read by the assert below; mark it used so that NDEBUG builds
  // do not emit an unused-parameter warning.
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (i = 0; i < h; ++i) {
    // One alpha value per row.
    const int m = mask[i];
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                              src1[i * src1_stride + j]);
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_integer.h"
#include "./aom_dsp_rtcd.h"
// SSE4.1 entry point for blending with a 1D horizontal mask.
// To start out, just dispatch to the function using the 2D mask and
// pass mask stride as 0. This can be improved upon if necessary.
// With a mask stride of 0 the 2D routine is handed the same mask row for
// every output row; both subsampling flags are passed as 0.
void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, 0, h, w, 0, 0);
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth SSE4.1 entry point for blending with a 1D horizontal mask.
// Forwards to aom_highbd_blend_a64_mask_sse4_1 with a mask stride of 0 and
// both subsampling flags set to 0; bd is passed through unchanged.
void aom_highbd_blend_a64_hmask_sse4_1(
uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, int h, int w, int bd) {
aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
src1_8, src1_stride, mask, 0, h, w, 0, 0,
bd);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_BLEND_SSE4_H_
#define AOM_DSP_X86_BLEND_SSE4_H_
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
// Blend kernel for four 8-bit pixels.
//
// Loads 32 bits from each of src0 and src1, zero-extends the bytes to 16-bit
// lanes, multiplies them lane-wise by v_m0_w and v_m1_w respectively, adds the
// two products and passes the sum to xx_roundn_epu16 with
// AOM_BLEND_A64_ROUND_BITS. The result is returned as 16-bit lanes; packing
// back to bytes is left to the caller.
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
                              const __m128i v_m0_w, const __m128i v_m1_w) {
  // Widen the source bytes to 16-bit lanes.
  const __m128i v_pix0_w = _mm_cvtepu8_epi16(xx_loadl_32(src0));
  const __m128i v_pix1_w = _mm_cvtepu8_epi16(xx_loadl_32(src1));

  // Weighted sum of the two sources, still unscaled.
  const __m128i v_acc_w = _mm_add_epi16(_mm_mullo_epi16(v_pix0_w, v_m0_w),
                                        _mm_mullo_epi16(v_pix1_w, v_m1_w));

  return xx_roundn_epu16(v_acc_w, AOM_BLEND_A64_ROUND_BITS);
}
// Blend kernel for eight 8-bit pixels.
//
// Loads from src0 and src1 with xx_loadl_64, zero-extends the low eight bytes
// to 16-bit lanes, multiplies them lane-wise by v_m0_w and v_m1_w
// respectively, adds the two products and passes the sum to xx_roundn_epu16
// with AOM_BLEND_A64_ROUND_BITS. The result is returned as 16-bit lanes;
// packing back to bytes is left to the caller.
static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
                              const __m128i v_m0_w, const __m128i v_m1_w) {
  // Widen the source bytes to 16-bit lanes.
  const __m128i v_pix0_w = _mm_cvtepu8_epi16(xx_loadl_64(src0));
  const __m128i v_pix1_w = _mm_cvtepu8_epi16(xx_loadl_64(src1));

  // Weighted sum of the two sources, still unscaled.
  const __m128i v_acc_w = _mm_add_epi16(_mm_mullo_epi16(v_pix0_w, v_m0_w),
                                        _mm_mullo_epi16(v_pix1_w, v_m1_w));

  return xx_roundn_epu16(v_acc_w, AOM_BLEND_A64_ROUND_BITS);
}
#if CONFIG_AOM_HIGHBITDEPTH
// Signature shared by the high bit-depth blend kernels below: two pointers to
// 16-bit source samples plus one weight vector per source, returning the
// blended result as a vector.
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w);
// High bit-depth blend kernel that loads its 16-bit samples with xx_loadl_64.
//
// Works entirely in 16-bit lanes: each source is multiplied by its weight
// vector with _mm_mullo_epi16 (low 16 bits of each product), the products are
// added, and the sum is passed to xx_roundn_epu16 with
// AOM_BLEND_A64_ROUND_BITS.
// NOTE(review): presumably intended for bit depths of up to 10, where the
// weighted sum still fits in 16 bits - confirm against the callers.
static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_wgt0_w = _mm_mullo_epi16(xx_loadl_64(src0), v_m0_w);
  const __m128i v_wgt1_w = _mm_mullo_epi16(xx_loadl_64(src1), v_m1_w);

  const __m128i v_acc_w = _mm_add_epi16(v_wgt0_w, v_wgt1_w);

  return xx_roundn_epu16(v_acc_w, AOM_BLEND_A64_ROUND_BITS);
}
// High bit-depth blend kernel that loads its 16-bit samples with xx_loadu_128.
//
// Works entirely in 16-bit lanes: each source is multiplied by its weight
// vector with _mm_mullo_epi16 (low 16 bits of each product), the products are
// added, and the sum is passed to xx_roundn_epu16 with
// AOM_BLEND_A64_ROUND_BITS.
// NOTE(review): presumably intended for bit depths of up to 10, where the
// weighted sum still fits in 16 bits - confirm against the callers.
static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_wgt0_w = _mm_mullo_epi16(xx_loadu_128(src0), v_m0_w);
  const __m128i v_wgt1_w = _mm_mullo_epi16(xx_loadu_128(src1), v_m1_w);

  const __m128i v_acc_w = _mm_add_epi16(v_wgt0_w, v_wgt1_w);

  return xx_roundn_epu16(v_acc_w, AOM_BLEND_A64_ROUND_BITS);
}
// High bit-depth blend kernel that loads its 16-bit samples with xx_loadl_64
// and accumulates in 32-bit lanes.
//
// Samples and weights are interleaved so that _mm_madd_epi16 produces
// s0 * m0 + s1 * m1 per 32-bit lane. The sum is shifted right by
// AOM_BLEND_A64_ROUND_BITS - 1, packed back to 16-bit lanes (the same packed
// four values fill both halves of the result) and handed to xx_round_epu16.
// NOTE(review): xx_round_epu16 is defined elsewhere; presumably it performs
// the final rounding shift by one bit - confirm.
static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_pix0_w = xx_loadl_64(src0);
  const __m128i v_pix1_w = xx_loadl_64(src1);

  // Pair up (s0, s1) samples and (m0, m1) weights lane by lane.
  const __m128i v_pix_pairs_w = _mm_unpacklo_epi16(v_pix0_w, v_pix1_w);
  const __m128i v_wgt_pairs_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);

  // 32-bit weighted sums.
  const __m128i v_acc_d = _mm_madd_epi16(v_pix_pairs_w, v_wgt_pairs_w);

  // Leave one bit of the scale for the final rounding step.
  const __m128i v_scaled_d =
      _mm_srli_epi32(v_acc_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Narrow to 16-bit lanes, then round.
  const __m128i v_scaled_w = _mm_packs_epi32(v_scaled_d, v_scaled_d);

  return xx_round_epu16(v_scaled_w);
}
// High bit-depth blend kernel that loads its 16-bit samples with xx_loadu_128
// and accumulates in 32-bit lanes.
//
// Samples and weights are interleaved (low and high halves separately) so
// that _mm_madd_epi16 produces s0 * m0 + s1 * m1 per 32-bit lane. Both sums
// are shifted right by AOM_BLEND_A64_ROUND_BITS - 1, packed back into 16-bit
// lanes and handed to xx_round_epu16.
// NOTE(review): xx_round_epu16 is defined elsewhere; presumably it performs
// the final rounding shift by one bit - confirm.
static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_pix0_w = xx_loadu_128(src0);
  const __m128i v_pix1_w = xx_loadu_128(src1);

  // Pair up (s0, s1) samples and (m0, m1) weights lane by lane.
  const __m128i v_pix_lo_w = _mm_unpacklo_epi16(v_pix0_w, v_pix1_w);
  const __m128i v_pix_hi_w = _mm_unpackhi_epi16(v_pix0_w, v_pix1_w);
  const __m128i v_wgt_lo_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  const __m128i v_wgt_hi_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);

  // 32-bit weighted sums for the low and high four pixels.
  const __m128i v_acc_lo_d = _mm_madd_epi16(v_pix_lo_w, v_wgt_lo_w);
  const __m128i v_acc_hi_d = _mm_madd_epi16(v_pix_hi_w, v_wgt_hi_w);

  // Leave one bit of the scale for the final rounding step.
  const __m128i v_scaled_lo_d =
      _mm_srli_epi32(v_acc_lo_d, AOM_BLEND_A64_ROUND_BITS - 1);
  const __m128i v_scaled_hi_d =
      _mm_srli_epi32(v_acc_hi_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Narrow to 16-bit lanes, then round.
  const __m128i v_scaled_w = _mm_packs_epi32(v_scaled_lo_d, v_scaled_hi_d);

  return xx_round_epu16(v_scaled_w);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // AOM_DSP_X86_BLEND_SSE4_H_
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_SYNONYMS_H_
#define AOM_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
#include <string.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
/**
* Various reusable shorthands for x86 SIMD intrinsics.
*
* Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
* Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
*/
// Loads and stores to do away with the tedium of casting the address
// to the right type.
// Loads 32 bits from 'a' into the low lane of an XMM register; the remaining
// lanes are zeroed by _mm_cvtsi32_si128.
//
// 'a' may have any alignment and any underlying object type (callers pass
// uint8_t pixel pointers). The bytes are therefore fetched with memcpy
// rather than by dereferencing a casted uint32_t pointer, which would be a
// strict-aliasing violation and a potentially misaligned access.
static INLINE __m128i xx_loadl_32(const void *a) {
  uint32_t bits;
  memcpy(&bits, a, sizeof(bits));
  return _mm_cvtsi32_si128((int)bits);
}
static INLINE __m128i xx_loadl_64(const void *a) {