Xiph.Org / aom-rav1e / Commits / 98e9ce92

Commit 98e9ce92, authored Oct 13, 2016 by Yaowu Xu, committed by Gerrit Code Review on Oct 13, 2016

Merge "Add SSE4.1 code for deringing functions." into nextgenv2

Parents: 3feb8917, 7227b65c
Changes: 8 files
av1/av1_common.mk  (view file @ 98e9ce92)

@@ -98,6 +98,8 @@ endif
 ifeq ($(CONFIG_DERING),yes)
 AV1_COMMON_SRCS-yes += common/od_dering.c
 AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
 AV1_COMMON_SRCS-yes += common/dering.c
 AV1_COMMON_SRCS-yes += common/dering.h
 endif
av1/common/av1_rtcd_defs.pl  (view file @ 98e9ce92)

@@ -20,6 +20,7 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
+typedef int16_t od_dering_in;
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;

@@ -840,4 +841,24 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
 }
 # end encoder functions
+
+# Deringing Functions
+
+if (aom_config("CONFIG_DERING") eq "yes") {
+  add_proto qw/int od_dir_find8/,
+      "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+  specialize qw/od_dir_find8 sse4_1/;
+  add_proto qw/int od_filter_dering_direction_4x4/,
+      "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_4x4 sse4_1/;
+  add_proto qw/int od_filter_dering_direction_8x8/,
+      "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+  add_proto qw/void od_filter_dering_orthogonal_4x4/,
+      "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_4x4 sse4_1/;
+  add_proto qw/void od_filter_dering_orthogonal_8x8/,
+      "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
+}
 1;
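The add_proto/specialize pairs above feed libaom's run-time CPU dispatch (RTCD) generator, which turns av1_rtcd_defs.pl into the ./av1_rtcd.h header that od_dering.c includes below. As a rough sketch of the pattern the generator emits (illustrative; the generated header differs in detail), each specialized function becomes a pointer bound once at startup from the detected SIMD capabilities:

    /* Sketch of RTCD dispatch for od_dir_find8 (illustrative; the generated
       av1_rtcd.h differs in detail). */
    int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
                       int coeff_shift);
    int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                            int coeff_shift);
    RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride,
                                    int32_t *var, int coeff_shift);

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();
      od_dir_find8 = od_dir_find8_c;
      if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
    }

Callers simply invoke od_dir_find8(...). Renaming the static C implementation to od_dir_find8_c in od_dering.c below is what exposes it as the dispatch fallback.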
av1/common/dering.c  (view file @ 98e9ce92)

@@ -111,7 +111,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
       if (pli) level = (level * 5 + 4) >> 3;
       if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
       threshold = level << coeff_shift;
-      od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
+      od_dering(dst, MAX_MIB_SIZE * bsize[pli],
                 &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                           sbc * bsize[pli] * MAX_MIB_SIZE],
                 stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
av1/common/od_dering.c  (view file @ 98e9ce92)

@@ -15,11 +15,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include "dering.h"
-
-const od_dering_opt_vtbl OD_DERING_VTBL_C = {
-  { od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c },
-  { od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c }
-};
+#include "./av1_rtcd.h"

 /* Generated from gen_filter_tables.c. */
 const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {

@@ -42,8 +38,8 @@ const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
    in a particular direction. Since each direction have the same sum(x^2) term,
    that term is never computed. See Section 2, step 2, of:
    http://jmvalin.ca/notes/intra_paint.pdf */
-static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
-                        int coeff_shift) {
+int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
+                   int coeff_shift) {
   int i;
   int32_t cost[8] = { 0 };
   int partial[8][15] = { { 0 } };

@@ -273,9 +269,8 @@ static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
   }
 }

-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nhb, int nvb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift) {

@@ -289,6 +284,12 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh2[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
+    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+  };
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
+    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+  };
   bsize = 3 - xdec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average

@@ -340,7 +341,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
          to be a little bit more aggressive on pure horizontal/vertical
          since the ringing there tends to be directional, so it doesn't
          get removed by the directional filtering. */
-      thresh2[by][bx] = (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+      thresh2[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
          dir[by][bx]);

@@ -354,7 +355,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   for (by = 0; by < nvb; by++) {
     for (bx = 0; bx < nhb; bx++) {
       if (thresh[by][bx] == 0) continue;
-      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh2[by][bx],
          dir[by][bx]);
av1/common/od_dering.h  (view file @ 98e9ce92)

@@ -34,27 +34,11 @@ typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
-struct od_dering_opt_vtbl {
-  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
-  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
-};
-typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
-
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nvb, int nhb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift);
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
-                                  int ln, int threshold, int dir);
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
-                                   const od_dering_in *x, int xstride, int ln,
-                                   int threshold, int dir);
-
-extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
-
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
                                      const int16_t *in, int threshold,
                                      int dir);
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
                                      const int16_t *in,

@@ -65,5 +49,4 @@ void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
-
 #endif
av1/common/x86/od_dering_sse4.c  (new file, 0 → 100644, view file @ 98e9ce92)
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "./av1_rtcd.h"
#include "av1/common/x86/od_dering_sse4.h"
/* partial A is a 16-bit vector of the form:
[x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
[0 y1 y2 y3 y4 y5 y6 y7].
This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
(x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
and const2. */
static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
                                       __m128i const1, __m128i const2) {
  __m128i tmp;
  /* Reverse partial B. */
  partialb = _mm_shuffle_epi8(partialb,
                              _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9,
                                           8, 11, 10, 13, 12));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = _mm_unpacklo_epi16(partiala, partialb);
  partialb = _mm_unpackhi_epi16(tmp, partialb);
  /* Square and add the corresponding x and y values. */
  partiala = _mm_madd_epi16(partiala, partiala);
  partialb = _mm_madd_epi16(partialb, partialb);
  /* Multiply by constant. */
  partiala = _mm_mullo_epi32(partiala, const1);
  partialb = _mm_mullo_epi32(partialb, const2);
  /* Sum all results. */
  partiala = _mm_add_epi32(partiala, partialb);
  return partiala;
}
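/* A scalar model of fold_mul_and_sum() once the later horizontal sum is
   applied (illustrative sketch): with x[1..8] taken from partiala and
   y[1..7] from partialb,
     sum = x[8]*x[8]*C[8];
     for (k = 1; k <= 7; k++) sum += (x[k]*x[k] + y[k]*y[k])*C[k];
   where C[1..8] are the constants packed into const1 and const2. */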
static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
  __m128i t0, t1, t2, t3;
  t0 = _mm_unpacklo_epi32(x0, x1);
  t1 = _mm_unpacklo_epi32(x2, x3);
  t2 = _mm_unpackhi_epi32(x0, x1);
  t3 = _mm_unpackhi_epi32(x2, x3);
  x0 = _mm_unpacklo_epi64(t0, t1);
  x1 = _mm_unpackhi_epi64(t0, t1);
  x2 = _mm_unpacklo_epi64(t2, t3);
  x3 = _mm_unpackhi_epi64(t2, t3);
  return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}
/* Horizontal sum of 8x16-bit unsigned values. */
static INLINE int32_t hsum_epi16(__m128i a) {
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  a = _mm_hadd_epi32(a, a);
  a = _mm_hadd_epi32(a, a);
  return _mm_cvtsi128_si32(a);
}
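/* i.e. hsum_epi16(a) == a[0] + a[1] + ... + a[7]: the madd against all-ones
   widens the 16-bit lanes into four 32-bit partial sums, and the two hadds
   collapse them into lane 0. */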
/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE __m128i compute_directions(__m128i lines[8],
                                         int32_t tmp_cost1[4]) {
  __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m128i partial6;
  __m128i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm_slli_si128(lines[0], 14);
  partial4b = _mm_srli_si128(lines[0], 2);
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
  tmp = _mm_add_epi16(lines[0], lines[1]);
  partial5a = _mm_slli_si128(tmp, 10);
  partial5b = _mm_srli_si128(tmp, 6);
  partial7a = _mm_slli_si128(tmp, 4);
  partial7b = _mm_srli_si128(tmp, 12);
  partial6 = tmp;
  /* Partial sums for lines 2 and 3. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
  tmp = _mm_add_epi16(lines[2], lines[3]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
  partial6 = _mm_add_epi16(partial6, tmp);
  /* Partial sums for lines 4 and 5. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
  tmp = _mm_add_epi16(lines[4], lines[5]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
  partial6 = _mm_add_epi16(partial6, tmp);
  /* Partial sums for lines 6 and 7. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
  partial4a = _mm_add_epi16(partial4a, lines[7]);
  tmp = _mm_add_epi16(lines[6], lines[7]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
  partial6 = _mm_add_epi16(partial6, tmp);
  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
                       _mm_set_epi32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial6 = _mm_madd_epi16(partial6, partial6);
  partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
  return partial4a;
}
/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
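/* Scalar equivalent of the rotation above (illustrative): viewing in[] and
   res[] as 8x8 matrices of int16_t, res[i][j] = in[j][7 - i]. */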
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  __m128i lines[8];
  __m128i dir03, dir47;
  __m128i max;
  for (i = 0; i < 8; i++) {
    lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
    lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
                             _mm_set1_epi16(128));
  }

  /* Compute "mostly vertical" directions. */
  dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  dir03 = compute_directions(lines, cost);

#if 1
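  /* Branchless argmax (the #else branch below is the scalar equivalent):
     broadcast the maximum cost to every lane, mark each lane whose cost
     matches with ~index (so the smallest index is the largest unsigned
     value), reduce with unsigned max, then invert to recover the index. */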
  max = _mm_max_epi32(dir03, dir47);
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
  dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
                        _mm_setr_epi32(-1, -2, -3, -4));
  dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
                        _mm_setr_epi32(-5, -6, -7, -8));
  dir03 = _mm_max_epu32(dir03, dir47);
  dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
  dir03 = _mm_max_epu32(dir03,
                        _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
  dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));
  best_dir = _mm_cvtsi128_si32(dir03);
  best_cost = _mm_cvtsi128_si32(max);
#else
  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
#endif
  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}
static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
  return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
}
int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride,
                                          const int16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i tmp;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
                     _mm_unpackhi_epi64(res, res));
  }
  return (hsum_epi16(total_abs) + 2) >> 2;
}
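/* Scalar model of the 4x4 kernel above (illustrative): for each pixel x,
   every neighbour at +/-off1 and +/-off2 along the direction contributes
   w*(neighbour - x) to sum when |neighbour - x| < threshold, with taps
   w = {4, 1} (the shift left by 2 is the *4); the filtered pixel is
   x + ((sum + 8) >> 4), and the return value is a normalized total of the
   absolute corrections, which the caller uses to derive thresh2. */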
int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride,
                                          const int16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 8; i++) {
    sum = _mm_set1_epi16(0);
    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
  }
  return (hsum_epi16(total_abs) + 8) >> 4;
}
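/* The 8x8 kernel above follows the same pattern with a third offset and taps
   w = {3, 2, 1}: the *3 is computed as p + (p << 1) and the *2 as p << 1.
   The absolute-correction total is normalized by >> 4 rather than >> 2
   since four times as many pixels contribute. */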
void od_filter_dering_orthogonal_4x4_sse4_1(int16_t *y, int ystride,
                                            const int16_t *in, int threshold,
                                            int dir) {
  int i;
  int offset;
  __m128i res;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i sum;
  __m128i tmp;
  __m128i thresh;
  thresh = _mm_set1_epi16(threshold);
  if (dir > 0 && dir < 4)
    offset = OD_FILT_BSTRIDE;
  else
    offset = 1;
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + offset]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + offset]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - offset]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - offset]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*row + ((5*sum + 8) >> 4)*/
    res = _mm_mullo_epi16(sum, _mm_set1_epi16(5));
    res = _mm_add_epi16(res, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    res = _mm_add_epi16(res, row);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);