Commit ad0196b8 authored by Joe Young
Browse files

[intra-edge] Vectorize upsampling

Add sse4_1 functions for Intra-edge experiment:
  av1_upsample_intra_edge_sse4_1()
  av1_upsample_intra_edge_high_sse4_1()

Approximate cycle reduction at QP 20, 1 keyframe:
  Enc:  0.5% to 0.3%
  Dec:  0.4% to 0.2%

Change-Id: I97f0eee09b78218b418b484d80c338cec037f1b9
parent 676c25cd
......@@ -669,9 +669,13 @@ if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
# Intra-edge experiment: prototypes for the edge filter and edge upsampler,
# each listed with an sse4_1 specialization.
# NOTE(review): aom_config/add_proto/specialize are defined outside this
# snippet; presumably they feed the run-time CPU dispatch generator so that
# callers can use the un-suffixed names -- confirm against the rtcd script.
if (aom_config("CONFIG_INTRA_EDGE") eq "yes") {
# 8-bit edge filter: (edge pointer, sample count, filter strength).
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
specialize qw/av1_filter_intra_edge sse4_1/;
# 8-bit edge upsampler: (edge pointer, sample count).
add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
specialize qw/av1_upsample_intra_edge sse4_1/;
# The 16-bit variants are only declared for high-bitdepth builds.
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
specialize qw/av1_filter_intra_edge_high sse4_1/;
# The high-bitdepth upsampler additionally takes the bit depth (bd).
add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
specialize qw/av1_upsample_intra_edge_high sse4_1/;
}
}
......
......@@ -2380,7 +2380,7 @@ static int use_intra_edge_upsample(int bsz, int delta) {
return (bsz == 4 && d > 0 && d < 56);
}
static void upsample_intra_edge(uint8_t *p, int sz) {
void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
// interpolate half-sample positions
assert(sz <= MAX_UPSAMPLE_SZ);
......@@ -2404,7 +2404,7 @@ static void upsample_intra_edge(uint8_t *p, int sz) {
}
#if CONFIG_HIGHBITDEPTH
static void upsample_intra_edge_high(uint16_t *p, int sz, int bd) {
void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
// interpolate half-sample positions
assert(sz <= MAX_UPSAMPLE_SZ);
......@@ -2645,12 +2645,12 @@ static void build_intra_predictors_high(
const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90);
if (upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
upsample_intra_edge_high(above_row, n_px, xd->bd);
av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
}
const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180);
if (upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
upsample_intra_edge_high(left_col, n_px, xd->bd);
av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
}
#endif // CONFIG_INTRA_EDGE_UPSAMPLE
#endif // CONFIG_INTRA_EDGE
......@@ -2887,12 +2887,12 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90);
if (upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
upsample_intra_edge(above_row, n_px);
av1_upsample_intra_edge(above_row, n_px);
}
const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180);
if (upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
upsample_intra_edge(left_col, n_px);
av1_upsample_intra_edge(left_col, n_px);
}
#endif // CONFIG_INTRA_EDGE_UPSAMPLE
#endif // CONFIG_INTRA_EDGE
......
......@@ -205,3 +205,116 @@ void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
}
}
}
void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  // SSE4.1 in-place 2x upsampling of an 8-bit edge. Every input sample is
  // kept and followed by a half-sample value computed with the 4-tap filter
  // (-1, 9, 9, -1), rounded and divided by 16. Output starts at p[-2].
  //
  // The routine always loads 32 bytes starting at p[-2] and stores 32 bytes
  // per loop pass, regardless of sz.
  // NOTE(review): the caller's buffer is assumed to be large enough for
  // those accesses -- confirm against the call sites.
  assert(sz <= 24);

  // Filter taps, repeated so that _mm_maddubs_epi16 followed by
  // _mm_hadd_epi16 yields one 4-tap sum per group of four bytes.
  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  };

  // Shuffle masks gathering the overlapping 4-byte windows that start at
  // byte offsets 0..3 (row 0) and 4..7 (row 1) of a register.
  DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  };

  // Replicate the first (upper-left, p[-1]) and last (p[sz - 1]) samples so
  // the 4-tap window is defined at both ends of the edge.
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint8_t *const src = p - 2;
  uint8_t *dst = p - 2;

  // All input is read before the first store because the output overwrites
  // the same memory.
  __m128i cur = _mm_lddqu_si128((const __m128i *)&src[0]);
  __m128i next = _mm_lddqu_si128((const __m128i *)&src[16]);

  const __m128i taps = _mm_lddqu_si128((const __m128i *)kernel[0]);
  const __m128i win_lo = _mm_lddqu_si128((const __m128i *)v_const[0]);
  const __m128i win_hi = _mm_lddqu_si128((const __m128i *)v_const[1]);
  const __m128i rounding = _mm_set1_epi16(8);

  // The count starts at sz + 1 because the upper-left sample is part of the
  // input; each pass consumes 16 input samples.
  for (int remaining = sz + 1; remaining > 0; remaining -= 16) {
    // Bytes 8..23 of the current 32-byte window.
    const __m128i mid = _mm_alignr_epi8(next, cur, 8);

    // Multiply each 4-byte window by the taps; this leaves two partial
    // sums (16-bit) per window.
    const __m128i q0 = _mm_maddubs_epi16(_mm_shuffle_epi8(cur, win_lo), taps);
    const __m128i q1 = _mm_maddubs_epi16(_mm_shuffle_epi8(cur, win_hi), taps);
    const __m128i q2 = _mm_maddubs_epi16(_mm_shuffle_epi8(mid, win_lo), taps);
    const __m128i q3 = _mm_maddubs_epi16(_mm_shuffle_epi8(mid, win_hi), taps);

    // Combine the partial sums, then round and shift right by 4.
    __m128i sum_a = _mm_hadd_epi16(q0, q1);
    __m128i sum_b = _mm_hadd_epi16(q2, q3);
    sum_a = _mm_srai_epi16(_mm_add_epi16(sum_a, rounding), 4);
    sum_b = _mm_srai_epi16(_mm_add_epi16(sum_b, rounding), 4);

    // Unsigned saturation to 8 bits clamps the results to [0, 255].
    const __m128i half = _mm_packus_epi16(sum_a, sum_b);

    // Original samples shifted by one byte, interleaved with the
    // interpolated half-samples.
    const __m128i full = _mm_alignr_epi8(next, cur, 1);
    _mm_storeu_si128((__m128i *)&dst[0], _mm_unpacklo_epi8(full, half));
    _mm_storeu_si128((__m128i *)&dst[16], _mm_unpackhi_epi8(full, half));

    // Slide the window; nothing beyond the initial 32 bytes is loaded.
    cur = next;
    next = _mm_setzero_si128();
    dst += 32;
  }
}
void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
  // SSE4.1 in-place 2x upsampling of a 16-bit edge. Every input sample is
  // kept and followed by a half-sample value computed with the 4-tap filter
  // (-1, 9, 9, -1), rounded, divided by 16 and limited to (1 << bd) - 1.
  // Output starts at p[-2].
  //
  // The routine always loads 32 samples starting at p[-2] and stores 16
  // samples per loop pass, regardless of sz.
  // NOTE(review): the caller's buffer is assumed to be large enough for
  // those accesses -- confirm against the call sites.
  assert(sz <= 24);

  // Tap pairs for _mm_madd_epi16: each 32-bit lane receives
  // -1 * (outer sum) + 9 * (inner sum).
  DECLARE_ALIGNED(16, static const int16_t,
                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };

  // Replicate the first (upper-left, p[-1]) and last (p[sz - 1]) samples so
  // the 4-tap window is defined at both ends of the edge.
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint16_t *const src = p - 2;
  uint16_t *dst = src;

  // All input is read before the first store because the output overwrites
  // the same memory.
  __m128i w0 = _mm_lddqu_si128((const __m128i *)&src[0]);
  __m128i w1 = _mm_lddqu_si128((const __m128i *)&src[8]);
  __m128i w2 = _mm_lddqu_si128((const __m128i *)&src[16]);
  __m128i w3 = _mm_lddqu_si128((const __m128i *)&src[24]);

  const __m128i taps = _mm_lddqu_si128((const __m128i *)kernel[0]);
  const __m128i rounding = _mm_set1_epi32(8);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // The count starts at sz + 1 because the upper-left sample is part of the
  // input; each pass consumes 8 input samples.
  for (int remaining = sz + 1; remaining > 0; remaining -= 8) {
    // The input shifted by one, two and three samples.
    const __m128i s1 = _mm_alignr_epi8(w1, w0, 2);
    const __m128i s2 = _mm_alignr_epi8(w1, w0, 4);
    const __m128i s3 = _mm_alignr_epi8(w1, w0, 6);

    // The filter is symmetric, so add the outer and the inner tap inputs
    // first and apply the -1 / 9 weights with a single multiply-add.
    const __m128i outer = _mm_add_epi16(w0, s3);
    const __m128i inner = _mm_add_epi16(s1, s2);
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(outer, inner), taps);
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(outer, inner), taps);

    // Round and shift right by 4 in 32-bit precision.
    lo = _mm_srai_epi32(_mm_add_epi32(lo, rounding), 4);
    hi = _mm_srai_epi32(_mm_add_epi32(hi, rounding), 4);

    // Unsigned saturation removes negative results; the minimum enforces
    // the upper limit derived from bd.
    const __m128i half = _mm_min_epi16(_mm_packus_epi32(lo, hi), pixel_max);

    // Original samples (shifted by one) interleaved with the half-samples.
    _mm_storeu_si128((__m128i *)&dst[0], _mm_unpacklo_epi16(s1, half));
    _mm_storeu_si128((__m128i *)&dst[8], _mm_unpackhi_epi16(s1, half));

    // Slide the window; nothing beyond the initial 32 samples is loaded.
    w0 = w1;
    w1 = w2;
    w2 = w3;
    w3 = _mm_setzero_si128();
    dst += 16;
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment