Xiph.Org / aom-rav1e

Commit 4580f337
Authored Aug 21, 2012 by Christian Duvivier
Committed by Gerrit Code Review, Aug 21, 2012

Merge "SSE2 version of vectorized 8-tap filtering." into experimental

Parents: 1b5e5e92 63ef9c40
Changes: 4 files
vp8/common/rtcd_defs.sh
...
...
@@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u
 # compiles warning free but a disassembly of generated code shows bugs. To be
 # on the safe side, only enabled when compiled with 'gcc'.
 if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp8_filter_block2d_4x4_8 sse4_1
-    specialize vp8_filter_block2d_8x4_8 sse4_1
-    specialize vp8_filter_block2d_8x8_8 sse4_1
-    specialize vp8_filter_block2d_16x16_8 sse4_1
+    specialize vp8_filter_block2d_4x4_8 sse4_1 sse2
+    specialize vp8_filter_block2d_8x4_8 sse4_1 sse2
+    specialize vp8_filter_block2d_8x8_8 sse4_1 sse2
+    specialize vp8_filter_block2d_16x16_8 sse4_1 sse2
 fi
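(Editorial aside.) The specialize lines above only register which optimized variants exist; libvpx's build then generates vpx_rtcd.h, and runtime CPU detection picks the best available version. Below is a minimal sketch of that dispatch for the 4x4 entry only, assuming the HAS_SSE2 / HAS_SSE4_1 flag names from vpx_ports; the real generated header differs in detail.

/* Sketch only: approximates the runtime dispatch implied by "specialize". */
#include "vpx_ports/x86.h"   /* assumed source of HAS_SSE2 / HAS_SSE4_1 */

typedef void (*filter_block2d_fn)(const unsigned char *src_ptr,
                                  const unsigned int src_stride,
                                  const short *HFilter_aligned16,
                                  const short *VFilter_aligned16,
                                  unsigned char *dst_ptr,
                                  unsigned int dst_stride);

extern void vp8_filter_block2d_4x4_8_c(const unsigned char *, const unsigned int,
                                       const short *, const short *,
                                       unsigned char *, unsigned int);
extern void vp8_filter_block2d_4x4_8_sse2(const unsigned char *, const unsigned int,
                                          const short *, const short *,
                                          unsigned char *, unsigned int);
extern void vp8_filter_block2d_4x4_8_sse4_1(const unsigned char *, const unsigned int,
                                            const short *, const short *,
                                            unsigned char *, unsigned int);

/* The codec calls through this pointer; plain C is the fallback. */
static filter_block2d_fn vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_c;

static void setup_rtcd_sketch(int cpu_flags) {
  if (cpu_flags & HAS_SSE2)
    vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_sse2;
  if (cpu_flags & HAS_SSE4_1)   /* later, stronger ISA overrides earlier */
    vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_sse4_1;
}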
vp8/common/x86/filter_sse2.c (new file, mode 100644)
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h> // for alignment checks
#include <emmintrin.h> // SSE2
#include "vp8/common/filter.h"
#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
#include "vpx_rtcd.h"
// TODO(cd): After cleanup, commit faster versions for non-4x4 sizes. This is
//           just a quick partial snapshot so that others can already use some
//           speedup.
// TODO(cd): Use vectorized 8-tap filtering code as speedup to pure C 6-tap
//           filtering.
// TODO(cd): Add some comments, better variable naming.
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
//           of positive above 128), or have higher precision filter
//           coefficients.
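// Editor's note (not part of the original commit): the caveat in the TODO
// above comes from pmaddubsw's saturation. _mm_maddubs_epi16 multiplies
// unsigned 8-bit pixels (up to 255) by signed 8-bit coefficients and
// saturates each pairwise sum to int16. A positive coefficient pair summing
// to 128 yields at most 255 * 128 = 32640, which still fits in 32767; a sum
// of 129 could reach 32895 and would saturate, corrupting the filter output.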
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
};
// Creating a macro to do more than four pixels at once to hide instruction
// latency is actually slower :-(
#define DO_FOUR_PIXELS(result, src_ptr, offset)                                   \
  {                                                                               \
  /* Do shifted load to achieve required shuffles through unpacking */            \
  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                            \
  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                        \
  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                            \
  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                        \
  /* Shift by 4 bytes through shuffle to get additional shifted loads */          \
  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));         \
  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));         \
  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));         \
  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));         \
  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                            \
  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                        \
  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                            \
  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                        \
  /* multiply accumulate them */                                                  \
  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                          \
  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                          \
  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                          \
  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                          \
  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                            \
  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                            \
  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                              \
  mad_all = _mm_add_epi32(mad_all, rounding);                                     \
  result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT);                             \
  }
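(Editorial aside.) For reference, a scalar sketch of what one DO_FOUR_PIXELS invocation computes: for each of four adjacent output positions, an 8-tap dot product over consecutive source bytes, rounded by VP8_FILTER_WEIGHT >> 1 and scaled down by VP8_FILTER_SHIFT (both from vp8/common/filter.h), exactly as the rounding_c constant and _mm_srai_epi32 above do.

/* Scalar equivalent of DO_FOUR_PIXELS, for reference only.  Assumes the
 * VP8_FILTER_WEIGHT / VP8_FILTER_SHIFT macros from vp8/common/filter.h and
 * an arithmetic right shift for negative sums, as _mm_srai_epi32 performs. */
static void do_four_pixels_c(int result[4], const unsigned char *src_ptr,
                             int offset, const short filter[8]) {
  int i, k;
  for (i = 0; i < 4; i++) {
    int sum = 0;
    for (k = 0; k < 8; k++)
      sum += src_ptr[offset + i + k] * filter[k];   /* 8-tap dot product */
    /* same rounding and scaling as the SSE2 path */
    result[i] = (sum + (VP8_FILTER_WEIGHT >> 1)) >> VP8_FILTER_SHIFT;
  }
}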
void vp8_filter_block2d_4x4_8_sse2
(
  const unsigned char *src_ptr, const unsigned int src_stride,
  const short *HFilter_aligned16, const short *VFilter_aligned16,
  unsigned char *dst_ptr, unsigned int dst_stride
) {
  __m128i intermediateA, intermediateB, intermediateC;
  const int kInterp_Extend = 4;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
  // check alignment
  assert(0 == ((long)HFilter_aligned16) % 16);
  assert(0 == ((long)VFilter_aligned16) % 16);
  {
    __m128i transpose3_0;
    __m128i transpose3_1;
    __m128i transpose3_2;
    __m128i transpose3_3;
    // Horizontal pass (src -> intermediate).
    {
      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
      // get first two columns filter coefficients
      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
      {
        __m128i mad_all0;
        __m128i mad_all1;
        __m128i mad_all2;
        __m128i mad_all3;
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0 * src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1 * src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2 * src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3 * src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride * 4;
        // --
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0 * src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1 * src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2 * src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3 * src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride * 4;
        // --
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0 * src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1 * src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2 * src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
      }
    }
    // Transpose result (intermediate -> transpose3_x)
    {
      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
      transpose3_0 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                         _mm_castsi128_ps(transpose2_2),
                         _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_1 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                         _mm_castsi128_ps(transpose2_2),
                         _MM_SHUFFLE(3, 2, 3, 2)));
      transpose3_2 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                         _mm_castsi128_ps(transpose2_3),
                         _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_3 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                         _mm_castsi128_ps(transpose2_3),
                         _MM_SHUFFLE(3, 2, 3, 2)));
      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
    }
    // Vertical pass (transpose3_x -> dst).
    {
      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
      // get first two columns filter coefficients
      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
      __m128i col0, col1, col2, col3;
      DECLARE_ALIGNED(16, unsigned char, temp[32]);
      {
        _mm_store_si128((__m128i *)temp, transpose3_0);
        DO_FOUR_PIXELS(col0, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_1);
        DO_FOUR_PIXELS(col1, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_2);
        DO_FOUR_PIXELS(col2, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_3);
        DO_FOUR_PIXELS(col3, temp, 0);
      }
      // transpose
      {
        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
        col0 = _mm_unpacklo_epi64(T0, T1);
        col1 = _mm_unpackhi_epi64(T0, T1);
        col2 = _mm_unpacklo_epi64(T2, T3);
        col3 = _mm_unpackhi_epi64(T2, T3);
      }
      // saturate to 8 bit
      {
        col0 = _mm_packs_epi32(col0, col0);
        col0 = _mm_packus_epi16(col0, col0);
        col1 = _mm_packs_epi32(col1, col1);
        col1 = _mm_packus_epi16(col1, col1);
        col2 = _mm_packs_epi32(col2, col2);
        col2 = _mm_packus_epi16(col2, col2);
        col3 = _mm_packs_epi32(col3, col3);
        col3 = _mm_packus_epi16(col3, col3);
      }
      // store
      {
        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
      }
    }
  }
}
void vp8_filter_block2d_8x4_8_sse2
(
  const unsigned char *src_ptr, const unsigned int src_stride,
  const short *HFilter_aligned16, const short *VFilter_aligned16,
  unsigned char *dst_ptr, unsigned int dst_stride
) {
  int j;
  for (j = 0; j < 8; j += 4) {
    vp8_filter_block2d_4x4_8_sse2(
        src_ptr + j, src_stride,
        HFilter_aligned16, VFilter_aligned16,
        dst_ptr + j, dst_stride);
  }
}
void vp8_filter_block2d_8x8_8_sse2
(
  const unsigned char *src_ptr, const unsigned int src_stride,
  const short *HFilter_aligned16, const short *VFilter_aligned16,
  unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i = 0; i < 8; i += 4) {
    for (j = 0; j < 8; j += 4) {
      vp8_filter_block2d_4x4_8_sse2(
          src_ptr + j + i * src_stride, src_stride,
          HFilter_aligned16, VFilter_aligned16,
          dst_ptr + j + i * dst_stride, dst_stride);
    }
  }
}
void vp8_filter_block2d_16x16_8_sse2
(
  const unsigned char *src_ptr, const unsigned int src_stride,
  const short *HFilter_aligned16, const short *VFilter_aligned16,
  unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i = 0; i < 16; i += 4) {
    for (j = 0; j < 16; j += 4) {
      vp8_filter_block2d_4x4_8_sse2(
          src_ptr + j + i * src_stride, src_stride,
          HFilter_aligned16, VFilter_aligned16,
          dst_ptr + j + i * dst_stride, dst_stride);
    }
  }
}
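(Editorial aside.) A hypothetical caller sketch, not part of this commit, to make the entry points' contract explicit: the filter tables must be 16-byte aligned (the 4x4 kernel asserts this), and src_ptr needs kInterp_Extend - 1 = 3 pixels of valid border above and to the left because the kernel rewinds before filtering, plus several readable pixels beyond the block since the loads are 16-byte unaligned. The coefficient values below are placeholders (an identity tap, assuming VP8_FILTER_WEIGHT == 128), not the real VP8 filter banks.

/* Hypothetical usage sketch only. */
#include "vpx_ports/mem.h"   /* DECLARE_ALIGNED */

DECLARE_ALIGNED(16, static const short, example_hfilter[8]) = {
  0, 0, 0, 128, 0, 0, 0, 0   /* placeholder identity tap at index 3 */
};
DECLARE_ALIGNED(16, static const short, example_vfilter[8]) = {
  0, 0, 0, 128, 0, 0, 0, 0
};

static void filter_example(const unsigned char *src, unsigned int src_stride,
                           unsigned char *dst, unsigned int dst_stride) {
  /* src must point inside a buffer with enough border for the 8-tap support */
  vp8_filter_block2d_16x16_8_sse2(src, src_stride,
                                  example_hfilter, example_vfilter,
                                  dst, dst_stride);
}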
vp8/common/x86/filter_sse4.c
...
...
@@ -25,9 +25,6 @@
 // TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
 //           of positive above 128), or have higher precision filter
 //           coefficients.
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
-//           require SSE4.1
-// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
 DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
   0x00, 0x01,
...
...
vp8/vp8_common.mk
...
...
@@ -119,6 +119,11 @@ ifeq ($(HAVE_SSE4_1),yes)
 vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
 endif
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+endif
 VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
 VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
...
...