Commit 89d321f7 authored by Joe Young's avatar Joe Young Committed by Joe Young

[intra-edge] Vectorize edge filtering functions

Add sse4_1 functions for Intra-edge experiment:
  av1_filter_intra_edge_sse4_1()
  av1_filter_intra_edge_high_sse4_1()

Approx cycle reduction at qp 20, 1 kf:
  Enc (lbd) 1.4% to 0.3%
  Dec (lbd) 0.4% to 0.1%
  Enc (hbd) 1.1% to 0.2%
  Dec (hbd) 0.6% to 0.1%

No change to bitstream

Change-Id: I176b2d125424d7d226114c807915c33dde5c3720
parent e80957ff
......@@ -412,6 +412,12 @@ if (CONFIG_LOOP_RESTORATION)
"${AOM_ROOT}/av1/encoder/pickrst.h")
endif ()
if (CONFIG_INTRA_EDGE)
set(AOM_AV1_COMMON_INTRIN_SSE4_1
${AOM_AV1_COMMON_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c")
endif ()
if (CONFIG_PVQ)
set(AOM_AV1_COMMON_SOURCES
${AOM_AV1_COMMON_SOURCES}
......
......@@ -85,6 +85,9 @@ AV1_COMMON_SRCS-yes += common/restoration.h
AV1_COMMON_SRCS-yes += common/restoration.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
endif
ifeq ($(CONFIG_INTRA_EDGE),yes)
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/intra_edge_sse4.c
endif
ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
AV1_COMMON_SRCS-yes += common/warped_motion.h
AV1_COMMON_SRCS-yes += common/warped_motion.c
......
......@@ -665,4 +665,14 @@ if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
}
}
# INTRA_EDGE functions
if (aom_config("CONFIG_INTRA_EDGE") eq "yes") {
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
specialize qw/av1_filter_intra_edge sse4_1/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
specialize qw/av1_filter_intra_edge_high sse4_1/;
}
}
1;
......@@ -2326,7 +2326,7 @@ static int intra_edge_filter_strength(int bsz, int delta) {
return strength;
}
static void filter_intra_edge(uint8_t *p, int sz, int strength) {
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
if (!strength) return;
const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
......@@ -2350,7 +2350,7 @@ static void filter_intra_edge(uint8_t *p, int sz, int strength) {
}
#if CONFIG_HIGHBITDEPTH
static void filter_intra_edge_high(uint16_t *p, int sz, int strength) {
void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
if (!strength) return;
const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
......@@ -2440,8 +2440,8 @@ static void build_intra_predictors_high(
int i;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 16]);
DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 16]);
DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]);
DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]);
uint16_t *const above_row = above_data + 16;
uint16_t *const left_col = left_data + 16;
const int txwpx = tx_size_wide[tx_size];
......@@ -2632,13 +2632,13 @@ static void build_intra_predictors_high(
if (need_above && n_top_px > 0) {
const int strength = intra_edge_filter_strength(txwpx, p_angle - 90);
const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0);
filter_intra_edge_high(above_row - ab_le, n_px, strength);
av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
}
if (need_left && n_left_px > 0) {
const int strength = intra_edge_filter_strength(txhpx, p_angle - 180);
const int n_px =
n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0);
filter_intra_edge_high(left_col - ab_le, n_px, strength);
av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
}
}
#if CONFIG_INTRA_EDGE_UPSAMPLE
......@@ -2684,8 +2684,8 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int plane) {
int i;
const uint8_t *above_ref = ref - ref_stride;
DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 16]);
DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 16]);
DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
uint8_t *const above_row = above_data + 16;
uint8_t *const left_col = left_data + 16;
const int txwpx = tx_size_wide[tx_size];
......@@ -2874,13 +2874,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
if (need_above && n_top_px > 0) {
const int strength = intra_edge_filter_strength(txwpx, p_angle - 90);
const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0);
filter_intra_edge(above_row - ab_le, n_px, strength);
av1_filter_intra_edge(above_row - ab_le, n_px, strength);
}
if (need_left && n_left_px > 0) {
const int strength = intra_edge_filter_strength(txhpx, p_angle - 180);
const int n_px =
n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0);
filter_intra_edge(left_col - ab_le, n_px, strength);
av1_filter_intra_edge(left_col - ab_le, n_px, strength);
}
}
#if CONFIG_INTRA_EDGE_UPSAMPLE
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <smmintrin.h>
#include "./aom_config.h"
#include "./av1_rtcd.h"
void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
if (!strength) return;
DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
{ 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
{ 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
{ 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
};
DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
{ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
{ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
};
// Extend the first and last samples to simplify the loop for the 5-tap case
if (strength == 3) {
p[-1] = p[0];
p[sz] = p[sz - 1];
}
// Adjust input pointer for filter support area
uint8_t *in = (strength == 3) ? p - 1 : p;
// Avoid modifying first/last samples
uint8_t *out = p + 1;
int len = sz - 2;
const int use_3tap_filter = (strength < 3);
if (use_3tap_filter) {
__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
__m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
__m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
__m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
__m128i in0 = _mm_lddqu_si128((__m128i *)in);
while (len > 0) {
int n_out = (len < 8) ? len : 8;
__m128i d0 = _mm_shuffle_epi8(in0, shuf0);
__m128i d1 = _mm_shuffle_epi8(in0, shuf1);
d0 = _mm_maddubs_epi16(d0, coef0);
d1 = _mm_maddubs_epi16(d1, coef0);
d0 = _mm_hadd_epi16(d0, d1);
__m128i eight = _mm_set1_epi16(8);
d0 = _mm_add_epi16(d0, eight);
d0 = _mm_srai_epi16(d0, 4);
d0 = _mm_packus_epi16(d0, d0);
__m128i out0 = _mm_lddqu_si128((__m128i *)out);
__m128i n0 = _mm_set1_epi8(n_out);
__m128i mask = _mm_cmpgt_epi8(n0, iden);
out0 = _mm_blendv_epi8(out0, d0, mask);
_mm_storel_epi64((__m128i *)out, out0);
__m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
in0 = _mm_alignr_epi8(in1, in0, 8);
in += 8;
out += 8;
len -= n_out;
}
} else { // 5-tap filter
__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
__m128i two = _mm_set1_epi8(2);
__m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
__m128i shuf_b = _mm_add_epi8(shuf_a, two);
__m128i shuf_c = _mm_add_epi8(shuf_b, two);
__m128i shuf_d = _mm_add_epi8(shuf_c, two);
__m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
__m128i in0 = _mm_lddqu_si128((__m128i *)in);
while (len > 0) {
int n_out = (len < 8) ? len : 8;
__m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
__m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
__m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
__m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
d0 = _mm_maddubs_epi16(d0, coef0);
d1 = _mm_maddubs_epi16(d1, coef0);
d2 = _mm_maddubs_epi16(d2, coef0);
d3 = _mm_maddubs_epi16(d3, coef0);
d0 = _mm_hadd_epi16(d0, d1);
d2 = _mm_hadd_epi16(d2, d3);
d0 = _mm_hadd_epi16(d0, d2);
__m128i eight = _mm_set1_epi16(8);
d0 = _mm_add_epi16(d0, eight);
d0 = _mm_srai_epi16(d0, 4);
d0 = _mm_packus_epi16(d0, d0);
__m128i out0 = _mm_lddqu_si128((__m128i *)out);
__m128i n0 = _mm_set1_epi8(n_out);
__m128i mask = _mm_cmpgt_epi8(n0, iden);
out0 = _mm_blendv_epi8(out0, d0, mask);
_mm_storel_epi64((__m128i *)out, out0);
__m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
in0 = _mm_alignr_epi8(in1, in0, 8);
in += 8;
out += 8;
len -= n_out;
}
}
}
void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
if (!strength) return;
DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
{ 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
{ 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
{ 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
};
DECLARE_ALIGNED(16, static const int16_t,
v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
// Extend the first and last samples to simplify the loop for the 5-tap case
if (strength == 3) {
p[-1] = p[0];
p[sz] = p[sz - 1];
}
// Adjust input pointer for filter support area
uint16_t *in = (strength == 3) ? p - 1 : p;
// Avoid modifying first/last samples
uint16_t *out = p + 1;
int len = sz - 2;
const int use_3tap_filter = (strength < 3);
if (use_3tap_filter) {
__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
__m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
__m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
while (len > 0) {
int n_out = (len < 8) ? len : 8;
__m128i in1 = _mm_alignr_epi8(in8, in0, 2);
__m128i in2 = _mm_alignr_epi8(in8, in0, 4);
__m128i in02 = _mm_add_epi16(in0, in2);
__m128i d0 = _mm_unpacklo_epi16(in02, in1);
__m128i d1 = _mm_unpackhi_epi16(in02, in1);
d0 = _mm_mullo_epi16(d0, coef0);
d1 = _mm_mullo_epi16(d1, coef0);
d0 = _mm_hadd_epi16(d0, d1);
__m128i eight = _mm_set1_epi16(8);
d0 = _mm_add_epi16(d0, eight);
d0 = _mm_srli_epi16(d0, 4);
__m128i out0 = _mm_lddqu_si128((__m128i *)out);
__m128i n0 = _mm_set1_epi16(n_out);
__m128i mask = _mm_cmpgt_epi16(n0, iden);
out0 = _mm_blendv_epi8(out0, d0, mask);
_mm_storeu_si128((__m128i *)out, out0);
in += 8;
in0 = in8;
in8 = _mm_lddqu_si128((__m128i *)&in[8]);
out += 8;
len -= n_out;
}
} else { // 5-tap filter
__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
__m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
__m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
while (len > 0) {
int n_out = (len < 8) ? len : 8;
__m128i in1 = _mm_alignr_epi8(in8, in0, 2);
__m128i in2 = _mm_alignr_epi8(in8, in0, 4);
__m128i in3 = _mm_alignr_epi8(in8, in0, 6);
__m128i in4 = _mm_alignr_epi8(in8, in0, 8);
__m128i in04 = _mm_add_epi16(in0, in4);
__m128i in123 = _mm_add_epi16(in1, in2);
in123 = _mm_add_epi16(in123, in3);
__m128i d0 = _mm_unpacklo_epi16(in04, in123);
__m128i d1 = _mm_unpackhi_epi16(in04, in123);
d0 = _mm_mullo_epi16(d0, coef0);
d1 = _mm_mullo_epi16(d1, coef0);
d0 = _mm_hadd_epi16(d0, d1);
__m128i eight = _mm_set1_epi16(8);
d0 = _mm_add_epi16(d0, eight);
d0 = _mm_srli_epi16(d0, 4);
__m128i out0 = _mm_lddqu_si128((__m128i *)out);
__m128i n0 = _mm_set1_epi16(n_out);
__m128i mask = _mm_cmpgt_epi16(n0, iden);
out0 = _mm_blendv_epi8(out0, d0, mask);
_mm_storeu_si128((__m128i *)out, out0);
in += 8;
in0 = in8;
in8 = _mm_lddqu_si128((__m128i *)&in[8]);
out += 8;
len -= n_out;
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment