Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
6c0f6dd8
Commit
6c0f6dd8
authored
Aug 21, 2015
by
Scott LaVarnway
Committed by
Gerrit Code Review
Aug 21, 2015
Browse files
Options
Browse Files
Download
Plain Diff
Merge "VPX: scaled convolve : fix windows build errors"
parents
f3fa7cfb
acf24cc1
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
79 additions
and
90 deletions
+79
-90
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+79
-90
No files found.
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
View file @
6c0f6dd8
...
...
@@ -616,7 +616,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
}
static
void
filter_horiz_w8_ssse3
(
const
uint8_t
*
src_x
,
in
t
src_pitch
,
static
void
filter_horiz_w8_ssse3
(
const
uint8_t
*
src_x
,
ptrdiff_
t
src_pitch
,
uint8_t
*
dst
,
const
int16_t
*
x_filter
)
{
const
__m128i
k_256
=
_mm_set1_epi16
(
1
<<
8
);
const
__m128i
f_values
=
_mm_load_si128
((
const
__m128i
*
)
x_filter
);
...
...
@@ -740,18 +740,17 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
static
void
filter_horiz_w4_ssse3
(
const
uint8_t
*
src_ptr
,
ptrdiff_t
src_pitch
,
uint8_t
*
dst
,
const
int16_t
*
filter
)
{
const
__m64
k_256
=
_mm_set1_pi16
(
1
<<
8
);
const
__m64
f_values_lo
=
*
((
const
__m64
*
)
filter
);
const
__m64
f_values_hi
=
*
((
const
__m64
*
)(
filter
+
4
));
const
__m128i
k_256
=
_mm_set1_epi16
(
1
<<
8
);
const
__m128i
f_values
=
_mm_load_si128
((
const
__m128i
*
)
filter
);
// pack and duplicate the filter values
const
__m
64
f1f0
=
_mm_shuffle_pi8
(
f_values
_lo
,
_mm_set1_pi16
(
0x0200u
));
const
__m
64
f3f2
=
_mm_shuffle_pi8
(
f_values
_lo
,
_mm_set1_pi16
(
0x0604u
));
const
__m
64
f5f4
=
_mm_shuffle_pi8
(
f_values
_hi
,
_mm_set1_pi16
(
0x0a08u
));
const
__m
64
f7f6
=
_mm_shuffle_pi8
(
f_values
_hi
,
_mm_set1_pi16
(
0x0e0cu
));
const
__m
64
A
=
*
((
const
__m
64
*
)
src_ptr
);
const
__m
64
B
=
*
((
const
__m
64
*
)(
src_ptr
+
src_pitch
));
const
__m
64
C
=
*
((
const
__m
64
*
)(
src_ptr
+
src_pitch
*
2
));
const
__m
64
D
=
*
((
const
__m
64
*
)(
src_ptr
+
src_pitch
*
3
));
const
__m
128i
f1f0
=
_mm_shuffle_
e
pi8
(
f_values
,
_mm_set1_
e
pi16
(
0x0200u
));
const
__m
128i
f3f2
=
_mm_shuffle_
e
pi8
(
f_values
,
_mm_set1_
e
pi16
(
0x0604u
));
const
__m
128i
f5f4
=
_mm_shuffle_
e
pi8
(
f_values
,
_mm_set1_
e
pi16
(
0x0a08u
));
const
__m
128i
f7f6
=
_mm_shuffle_
e
pi8
(
f_values
,
_mm_set1_
e
pi16
(
0x0e0cu
));
const
__m
128i
A
=
_mm_loadl_epi64
((
const
__m
128i
*
)
src_ptr
);
const
__m
128i
B
=
_mm_loadl_epi64
((
const
__m
128i
*
)(
src_ptr
+
src_pitch
));
const
__m
128i
C
=
_mm_loadl_epi64
((
const
__m
128i
*
)(
src_ptr
+
src_pitch
*
2
));
const
__m
128i
D
=
_mm_loadl_epi64
((
const
__m
128i
*
)(
src_ptr
+
src_pitch
*
3
));
// TRANSPOSE...
// 00 01 02 03 04 05 06 07
// 10 11 12 13 14 15 16 17
...
...
@@ -769,63 +768,57 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
// 06 16 26 36
// 07 17 27 37
//
// 00 01 10 11 02 03 12 13
const
__m64
tr0_0
=
_mm_unpacklo_pi16
(
A
,
B
);
// 20 21 30 31 22 23 32 33
const
__m64
tr0_1
=
_mm_unpacklo_pi16
(
C
,
D
);
// 04 05 14 15 06 07 16 17
const
__m64
tr0_2
=
_mm_unpackhi_pi16
(
A
,
B
);
// 24 25 34 35 26 27 36 37
const
__m64
tr0_3
=
_mm_unpackhi_pi16
(
C
,
D
);
// 00 01 10 11 20 21 30 31
const
__m64
s1s0
=
_mm_unpacklo_pi32
(
tr0_0
,
tr0_1
);
// 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
const
__m128i
tr0_0
=
_mm_unpacklo_epi16
(
A
,
B
);
// 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
const
__m128i
tr0_1
=
_mm_unpacklo_epi16
(
C
,
D
);
// 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
const
__m128i
s1s0
=
_mm_unpacklo_epi32
(
tr0_0
,
tr0_1
);
// 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
const
__m128i
s5s4
=
_mm_unpackhi_epi32
(
tr0_0
,
tr0_1
);
// 02 03 12 13 22 23 32 33
const
__m64
s3s2
=
_mm_unpackhi_pi32
(
tr0_0
,
tr0_1
);
// 04 05 14 15 24 25 34 35
const
__m64
s5s4
=
_mm_unpacklo_pi32
(
tr0_2
,
tr0_3
);
const
__m128i
s3s2
=
_mm_srli_si128
(
s1s0
,
8
);
// 06 07 16 17 26 27 36 37
const
__m
64
s7s6
=
_mm_
unpackhi_pi32
(
tr0_2
,
tr0_3
);
const
__m
128i
s7s6
=
_mm_
srli_si128
(
s5s4
,
8
);
// multiply 2 adjacent elements with the filter and add the result
const
__m
64
x0
=
_mm_maddubs_pi16
(
s1s0
,
f1f0
);
const
__m
64
x1
=
_mm_maddubs_pi16
(
s3s2
,
f3f2
);
const
__m
64
x2
=
_mm_maddubs_pi16
(
s5s4
,
f5f4
);
const
__m
64
x3
=
_mm_maddubs_pi16
(
s7s6
,
f7f6
);
const
__m
128i
x0
=
_mm_maddubs_
e
pi16
(
s1s0
,
f1f0
);
const
__m
128i
x1
=
_mm_maddubs_
e
pi16
(
s3s2
,
f3f2
);
const
__m
128i
x2
=
_mm_maddubs_
e
pi16
(
s5s4
,
f5f4
);
const
__m
128i
x3
=
_mm_maddubs_
e
pi16
(
s7s6
,
f7f6
);
// add and saturate the results together
const
__m
64
min_x2x1
=
_mm_min_pi16
(
x2
,
x1
);
const
__m
64
max_x2x1
=
_mm_max_pi16
(
x2
,
x1
);
__m
64
temp
=
_mm_adds_pi16
(
x0
,
x3
);
temp
=
_mm_adds_pi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_pi16
(
temp
,
max_x2x1
);
const
__m
128i
min_x2x1
=
_mm_min_
e
pi16
(
x2
,
x1
);
const
__m
128i
max_x2x1
=
_mm_max_
e
pi16
(
x2
,
x1
);
__m
128i
temp
=
_mm_adds_
e
pi16
(
x0
,
x3
);
temp
=
_mm_adds_
e
pi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_
e
pi16
(
temp
,
max_x2x1
);
// round and shift by 7 bit each 16 bit
temp
=
_mm_mulhrs_pi16
(
temp
,
k_256
);
temp
=
_mm_mulhrs_
e
pi16
(
temp
,
k_256
);
// shrink to 8 bit each 16 bits
temp
=
_mm_packs_
pu
16
(
temp
,
temp
);
temp
=
_mm_pack
u
s_
epi
16
(
temp
,
temp
);
// save only 4 bytes
*
(
int
*
)
dst
=
_mm_cvtsi
64
_si32
(
temp
);
*
(
int
*
)
dst
=
_mm_cvtsi
128
_si32
(
temp
);
}
static
void
transpose4x4_to_dst
(
const
uint8_t
*
src
,
ptrdiff_t
src_stride
,
uint8_t
*
dst
,
ptrdiff_t
dst_stride
)
{
__m
64
A
=
_mm_cvtsi32_si
64
(
*
(
const
int
*
)
src
);
__m
64
B
=
_mm_cvtsi32_si
64
(
*
(
const
int
*
)(
src
+
src_stride
));
__m
64
C
=
_mm_cvtsi32_si
64
(
*
(
const
int
*
)(
src
+
src_stride
*
2
));
__m
64
D
=
_mm_cvtsi32_si
64
(
*
(
const
int
*
)(
src
+
src_stride
*
3
));
__m
128i
A
=
_mm_cvtsi32_si
128
(
*
(
const
int
*
)
src
);
__m
128i
B
=
_mm_cvtsi32_si
128
(
*
(
const
int
*
)(
src
+
src_stride
));
__m
128i
C
=
_mm_cvtsi32_si
128
(
*
(
const
int
*
)(
src
+
src_stride
*
2
));
__m
128i
D
=
_mm_cvtsi32_si
128
(
*
(
const
int
*
)(
src
+
src_stride
*
3
));
// 00 10 01 11 02 12 03 13
const
__m
64
tr0_0
=
_mm_unpacklo_pi8
(
A
,
B
);
const
__m
128i
tr0_0
=
_mm_unpacklo_
e
pi8
(
A
,
B
);
// 20 30 21 31 22 32 23 33
const
__m64
tr0_1
=
_mm_unpacklo_pi8
(
C
,
D
);
// 00 10 20 30 01 11 21 31
A
=
_mm_unpacklo_pi16
(
tr0_0
,
tr0_1
);
// 02 12 22 32 03 13 23 33
C
=
_mm_unpackhi_pi16
(
tr0_0
,
tr0_1
);
B
=
_mm_unpackhi_pi32
(
A
,
A
);
D
=
_mm_unpackhi_pi32
(
C
,
C
);
*
(
int
*
)(
dst
)
=
_mm_cvtsi64_si32
(
A
);
*
(
int
*
)(
dst
+
dst_stride
)
=
_mm_cvtsi64_si32
(
B
);
*
(
int
*
)(
dst
+
dst_stride
*
2
)
=
_mm_cvtsi64_si32
(
C
);
*
(
int
*
)(
dst
+
dst_stride
*
3
)
=
_mm_cvtsi64_si32
(
D
);
const
__m128i
tr0_1
=
_mm_unpacklo_epi8
(
C
,
D
);
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
A
=
_mm_unpacklo_epi16
(
tr0_0
,
tr0_1
);
B
=
_mm_srli_si128
(
A
,
4
);
C
=
_mm_srli_si128
(
A
,
8
);
D
=
_mm_srli_si128
(
A
,
12
);
*
(
int
*
)(
dst
)
=
_mm_cvtsi128_si32
(
A
);
*
(
int
*
)(
dst
+
dst_stride
)
=
_mm_cvtsi128_si32
(
B
);
*
(
int
*
)(
dst
+
dst_stride
*
2
)
=
_mm_cvtsi128_si32
(
C
);
*
(
int
*
)(
dst
+
dst_stride
*
3
)
=
_mm_cvtsi128_si32
(
D
);
}
static
void
scaledconvolve_horiz_w4
(
const
uint8_t
*
src
,
ptrdiff_t
src_stride
,
...
...
@@ -865,46 +858,42 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
static
void
filter_vert_w4_ssse3
(
const
uint8_t
*
src_ptr
,
ptrdiff_t
src_pitch
,
uint8_t
*
dst
,
const
int16_t
*
filter
)
{
const
__m64
k_256
=
_mm_set1_pi16
(
1
<<
8
);
const
__m64
f_values_lo
=
*
((
const
__m64
*
)
filter
);
const
__m64
f_values_hi
=
*
((
const
__m64
*
)(
filter
+
4
));
const
__m128i
k_256
=
_mm_set1_epi16
(
1
<<
8
);
const
__m128i
f_values
=
_mm_load_si128
((
const
__m128i
*
)
filter
);
// pack and duplicate the filter values
const
__m64
f1f0
=
_mm_shuffle_pi8
(
f_values_lo
,
_mm_set1_pi16
(
0x0200u
));
const
__m64
f3f2
=
_mm_shuffle_pi8
(
f_values_lo
,
_mm_set1_pi16
(
0x0604u
));
const
__m64
f5f4
=
_mm_shuffle_pi8
(
f_values_hi
,
_mm_set1_pi16
(
0x0a08u
));
const
__m64
f7f6
=
_mm_shuffle_pi8
(
f_values_hi
,
_mm_set1_pi16
(
0x0e0cu
));
const
__m64
A
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)
src_ptr
);
const
__m64
B
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
));
const
__m64
C
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
2
));
const
__m64
D
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
3
));
const
__m64
E
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
4
));
const
__m64
F
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
5
));
const
__m64
G
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
6
));
const
__m64
H
=
_mm_cvtsi32_si64
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
7
));
const
__m64
s1s0
=
_mm_unpacklo_pi8
(
A
,
B
);
// 02 03 12 13 22 23 32 33
const
__m64
s3s2
=
_mm_unpacklo_pi8
(
C
,
D
);
// 04 05 14 15 24 25 34 35
const
__m64
s5s4
=
_mm_unpacklo_pi8
(
E
,
F
);
// 06 07 16 17 26 27 36 37
const
__m64
s7s6
=
_mm_unpacklo_pi8
(
G
,
H
);
const
__m128i
f1f0
=
_mm_shuffle_epi8
(
f_values
,
_mm_set1_epi16
(
0x0200u
));
const
__m128i
f3f2
=
_mm_shuffle_epi8
(
f_values
,
_mm_set1_epi16
(
0x0604u
));
const
__m128i
f5f4
=
_mm_shuffle_epi8
(
f_values
,
_mm_set1_epi16
(
0x0a08u
));
const
__m128i
f7f6
=
_mm_shuffle_epi8
(
f_values
,
_mm_set1_epi16
(
0x0e0cu
));
const
__m128i
A
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)
src_ptr
);
const
__m128i
B
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
));
const
__m128i
C
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
2
));
const
__m128i
D
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
3
));
const
__m128i
E
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
4
));
const
__m128i
F
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
5
));
const
__m128i
G
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
6
));
const
__m128i
H
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src_ptr
+
src_pitch
*
7
));
const
__m128i
s1s0
=
_mm_unpacklo_epi8
(
A
,
B
);
const
__m128i
s3s2
=
_mm_unpacklo_epi8
(
C
,
D
);
const
__m128i
s5s4
=
_mm_unpacklo_epi8
(
E
,
F
);
const
__m128i
s7s6
=
_mm_unpacklo_epi8
(
G
,
H
);
// multiply 2 adjacent elements with the filter and add the result
const
__m
64
x0
=
_mm_maddubs_pi16
(
s1s0
,
f1f0
);
const
__m
64
x1
=
_mm_maddubs_pi16
(
s3s2
,
f3f2
);
const
__m
64
x2
=
_mm_maddubs_pi16
(
s5s4
,
f5f4
);
const
__m
64
x3
=
_mm_maddubs_pi16
(
s7s6
,
f7f6
);
const
__m
128i
x0
=
_mm_maddubs_
e
pi16
(
s1s0
,
f1f0
);
const
__m
128i
x1
=
_mm_maddubs_
e
pi16
(
s3s2
,
f3f2
);
const
__m
128i
x2
=
_mm_maddubs_
e
pi16
(
s5s4
,
f5f4
);
const
__m
128i
x3
=
_mm_maddubs_
e
pi16
(
s7s6
,
f7f6
);
// add and saturate the results together
const
__m
64
min_x2x1
=
_mm_min_pi16
(
x2
,
x1
);
const
__m
64
max_x2x1
=
_mm_max_pi16
(
x2
,
x1
);
__m
64
temp
=
_mm_adds_pi16
(
x0
,
x3
);
temp
=
_mm_adds_pi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_pi16
(
temp
,
max_x2x1
);
const
__m
128i
min_x2x1
=
_mm_min_
e
pi16
(
x2
,
x1
);
const
__m
128i
max_x2x1
=
_mm_max_
e
pi16
(
x2
,
x1
);
__m
128i
temp
=
_mm_adds_
e
pi16
(
x0
,
x3
);
temp
=
_mm_adds_
e
pi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_
e
pi16
(
temp
,
max_x2x1
);
// round and shift by 7 bit each 16 bit
temp
=
_mm_mulhrs_pi16
(
temp
,
k_256
);
temp
=
_mm_mulhrs_
e
pi16
(
temp
,
k_256
);
// shrink to 8 bit each 16 bits
temp
=
_mm_packs_
pu
16
(
temp
,
temp
);
temp
=
_mm_pack
u
s_
epi
16
(
temp
,
temp
);
// save only 4 bytes
*
(
int
*
)
dst
=
_mm_cvtsi
64
_si32
(
temp
);
*
(
int
*
)
dst
=
_mm_cvtsi
128
_si32
(
temp
);
}
static
void
scaledconvolve_vert_w4
(
const
uint8_t
*
src
,
ptrdiff_t
src_stride
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment