Xiph.Org / aom-rav1e

Commit 78842b28
authored Jun 22, 2016 by Debargha Mukherjee, committed by Gerrit Code Review on Jun 22, 2016

Merge "Reinstate "Optimize wedge partition selection." without tests." into nextgenv2

parents c797e709 135d6631

Changes: 7 files
vp10/common/reconinter.c
...
@@ -2447,7 +2447,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
                                                 int wedge_offset_x,
                                                 int wedge_offset_y,
#endif  // CONFIG_SUPERTX
                                                 int mi_x, int mi_y,
                                                 uint8_t *ext_dst0,
                                                 int ext_dst_stride0,
                                                 uint8_t *ext_dst1,
...
@@ -2461,8 +2460,6 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
  (void) block;
  (void) bw;
  (void) bh;
  (void) mi_x;
  (void) mi_y;

  if (is_compound && is_interinter_wedge_used(mbmi->sb_type)
...
@@ -2526,12 +2523,9 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
void vp10_build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int plane_from, int plane_to,
    int mi_row, int mi_col,
    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
    uint8_t *ext_dst1[3], int ext_dst_stride1[3]) {
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
                                                        &xd->plane[plane]);
...
@@ -2550,7 +2544,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
                                             0, 0,
#endif
                                             mi_x, mi_y,
                                             ext_dst0[plane],
                                             ext_dst_stride0[plane],
                                             ext_dst1[plane],
...
@@ -2561,7 +2554,6 @@ void vp10_build_wedge_inter_predictor_from_buf(
#if CONFIG_SUPERTX
                                             0, 0,
#endif
                                             mi_x, mi_y,
                                             ext_dst0[plane],
                                             ext_dst_stride0[plane],
                                             ext_dst1[plane],
...
vp10/common/reconinter.h
...
@@ -652,7 +652,6 @@ void vp10_build_inter_predictors_for_planes_single_buf(
void vp10_build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int plane_from, int plane_to,
    int mi_row, int mi_col,
    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
    uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
#endif  // CONFIG_EXT_INTER
...
vp10/common/vp10_rtcd_defs.pl
...
@@ -725,6 +725,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
}
# End vp10_high encoder functions

if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
  add_proto qw/uint64_t vp10_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
  specialize qw/vp10_wedge_sse_from_residuals sse2/;

  add_proto qw/int vp10_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
  specialize qw/vp10_wedge_sign_from_residuals sse2/;

  add_proto qw/void vp10_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
  specialize qw/vp10_wedge_compute_delta_squares sse2/;
}
}
# end encoder functions

1;
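For readers unfamiliar with this file: add_proto declares a C prototype for the RTCD (run-time CPU detection) system, and specialize names the SIMD flavours allowed to override the C fallback. The generated vp10_rtcd.h then resolves each bare name (e.g. vp10_wedge_sse_from_residuals) to the best available implementation. A minimal, illustrative sketch of the dispatch pattern this produces, not the actual generated header:

/* Illustrative sketch only -- the real definitions are emitted into
 * vp10_rtcd.h by the rtcd script. With run-time detection the symbol is a
 * function pointer assigned during setup; static builds may #define it. */
#include <stdint.h>

uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
                                         const uint8_t *m, int N);
uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
                                            const int16_t *d,
                                            const uint8_t *m, int N);

uint64_t (*vp10_wedge_sse_from_residuals)(const int16_t *r1, const int16_t *d,
                                          const uint8_t *m, int N);

static void setup_wedge_rtcd_sketch(int have_sse2) {
  /* Hypothetical setup helper; the generated code does this per symbol. */
  vp10_wedge_sse_from_residuals =
      have_sse2 ? vp10_wedge_sse_from_residuals_sse2
                : vp10_wedge_sse_from_residuals_c;
}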
vp10/encoder/rdopt.c
This diff is collapsed.
vp10/encoder/wedge_utils.c
0 → 100644
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
* Computes SSE of a compound predictor constructed from 2 fundamental
* predictors p0 and p1 using blending with mask.
*
* r1: Residuals of p1.
* (source - p1)
* d: Difference of p1 and p0.
* (p1 - p0)
* m: The blending mask
* N: Number of pixels
*
* 'r1', 'd', and 'm' are contiguous.
*
* Computes:
* Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
* Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
 * where r0 is (source - p0) and r1 is (source - p1), which in turn
 * is equivalent to:
* Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
* which is the SSE of the residuals of the compound predictor scaled up by
* MAX_MASK_VALUE**2.
*
* Note that we clamp the partial term in the loop to 16 bits signed. This is
* to facilitate equivalent SIMD implementation. It should have no effect if
* residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
* holds for 8 bit input, and on real input, it should hold practically always,
* as residuals are expected to be small.
*/
uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1,
                                         const int16_t *d,
                                         const uint8_t *m,
                                         int N) {
  uint64_t csse = 0;
  int i;

  assert(N % 64 == 0);

  for (i = 0; i < N; i++) {
    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
    t = clamp(t, INT16_MIN, INT16_MAX);
    csse += t * t;
  }
  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
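As a quick sanity check of the equivalence derived in the comment above, the per-pixel identity MAX_MASK_VALUE*r1 + m*d == m*r0 + (MAX_MASK_VALUE - m)*r1 can be verified numerically; a minimal standalone sketch with arbitrary example values (assuming WEDGE_WEIGHT_BITS is 6, i.e. MAX_MASK_VALUE is 64, consistent with the [0, 64] mask range mentioned later in this commit):

#include <stdio.h>

/* Single-pixel check of the identity used above, with arbitrary values:
 * (MAX_MASK_VALUE*r1 + m*d) == (m*r0 + (MAX_MASK_VALUE - m)*r1)
 * where r0 = s - p0, r1 = s - p1, d = p1 - p0. */
int main(void) {
  const int kMaxMask = 64; /* 1 << 6 */
  const int s = 120, p0 = 100, p1 = 130, m = 20;
  const int r0 = s - p0, r1 = s - p1, d = p1 - p0;
  const int lhs = kMaxMask * r1 + m * d;
  const int rhs = m * r0 + (kMaxMask - m) * r1;
  printf("lhs=%d rhs=%d\n", lhs, rhs); /* both are -40 */
  return lhs != rhs;
}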
/**
* Choose the mask sign for a compound predictor.
*
* ds: Difference of the squares of the residuals.
* r0**2 - r1**2
* m: The blending mask
* N: Number of pixels
* limit: Pre-computed threshold value.
* MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
*
* 'ds' and 'm' are contiguous.
*
* Returns true if the negated mask has lower SSE compared to the positive
* mask. Computation is based on:
* Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
* >
* Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
*
* which can be simplified to:
*
* Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
*
* The right hand side does not depend on the mask, and needs to be passed as
* the 'limit' parameter.
*
* After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
* hand side is simply a scalar product between an int16_t and uint8_t vector.
*
* Note that for efficiency, ds is stored on 16 bits. Real input residuals
* being small, this should not cause a noticeable issue.
*/
int vp10_wedge_sign_from_residuals_c(const int16_t *ds,
                                     const uint8_t *m,
                                     int N,
                                     int64_t limit) {
  int64_t acc = 0;

  assert(N % 64 == 0);

  do {
    acc += *ds++ * *m++;
  } while (--N);

  return acc > limit;
}
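The caller is responsible for precomputing 'limit'; a hedged sketch of the expected call pattern (sse0, sse1, and the helper name are illustrative, not the encoder's actual variables; the real call sites are in rdopt.c):

/* Sketch, assuming it sits in this file so MAX_MASK_VALUE and the
 * prototype above are visible. sse0/sse1 stand for sum(r0^2) and
 * sum(r1^2). */
static int choose_wedge_sign_sketch(const int16_t *ds, const uint8_t *mask,
                                    int N, int64_t sse0, int64_t sse1) {
  /* Mask-independent right-hand side of the comparison derived above. */
  const int64_t limit = (MAX_MASK_VALUE / 2) * (sse0 - sse1);
  /* ds[i] must already hold r0[i]^2 - r1[i]^2, e.g. from
   * vp10_wedge_compute_delta_squares_c below. */
  return vp10_wedge_sign_from_residuals_c(ds, mask, N, limit);
}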
/**
* Compute the element-wise difference of the squares of 2 arrays.
*
* d: Difference of the squares of the inputs: a**2 - b**2
* a: First input array
* b: Second input array
* N: Number of elements
*
* 'd', 'a', and 'b' are contiguous.
*
* The result is saturated to signed 16 bits.
*/
void vp10_wedge_compute_delta_squares_c(int16_t *d,
                                        const int16_t *a,
                                        const int16_t *b,
                                        int N) {
  int i;

  assert(N % 64 == 0);

  for (i = 0; i < N; i++)
    d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
}
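Taken together, the three kernels support the wedge search roughly as follows; a heavily hedged sketch (the actual flow, including the loop over wedge shapes and all buffer setup, lives in rdopt.c):

/* Sketch of how the kernels compose; r0/r1 are the residuals of the two
 * predictors, d is p1 - p0, ds is scratch, and limit is precomputed as in
 * the sketch above. All names besides the three kernels are illustrative. */
static void wedge_kernels_sketch(const int16_t *r0, const int16_t *r1,
                                 const int16_t *d, int16_t *ds,
                                 const uint8_t *mask, int N, int64_t limit,
                                 int *sign, uint64_t *sse) {
  /* 1) ds[i] = r0[i]^2 - r1[i]^2, saturated to 16 bits. */
  vp10_wedge_compute_delta_squares_c(ds, r0, r1, N);
  /* 2) Decide whether the inverted mask gives the lower SSE. */
  *sign = vp10_wedge_sign_from_residuals_c(ds, mask, N, limit);
  /* 3) SSE of the blended predictor; for *sign != 0 the caller would pass
   * buffers matching the inverted mask. */
  *sse = vp10_wedge_sse_from_residuals_c(r1, d, mask, N);
}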
vp10/encoder/x86/wedge_utils_sse2.c
0 → 100644
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <immintrin.h>
#include "vpx_dsp/x86/synonyms.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/reconinter.h"
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
/**
* See vp10_wedge_sse_from_residuals_c
*/
uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
                                            const int16_t *d,
                                            const uint8_t *m,
                                            int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

  do {
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());

    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
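The C version clamps its partial term to 16 bits precisely so this SSE2 path (which packs the madd results with _mm_packs_epi32) stays bit-exact with it, which makes a cross-check harness straightforward. A hedged sketch follows; note the merge message says this change was reinstated without its tests, so none of this is the project's actual test code:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Prototypes would normally come from the generated vp10_rtcd.h. */
uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
                                         const uint8_t *m, int N);
uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
                                            const int16_t *d,
                                            const uint8_t *m, int N);

int main(void) {
  enum { kN = 4096 }; /* multiple of 64 */
  static int16_t r1[kN], d[kN];
  static uint8_t m[kN];
  uint64_t ref, opt;
  int i;
  srand(0);
  for (i = 0; i < kN; ++i) {
    r1[i] = (int16_t)(rand() % 512 - 256); /* small residuals, as expected */
    d[i] = (int16_t)(rand() % 512 - 256);
    m[i] = (uint8_t)(rand() % 65); /* mask values in [0, MAX_MASK_VALUE] */
  }
  ref = vp10_wedge_sse_from_residuals_c(r1, d, m, kN);
  opt = vp10_wedge_sse_from_residuals_sse2(r1, d, m, kN);
  printf("%s\n", ref == opt ? "match" : "MISMATCH");
  return ref != opt;
}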
/**
* See vp10_wedge_sign_from_residuals_c
*/
int vp10_wedge_sign_from_residuals_sse2(const int16_t *ds,
                                        const uint8_t *m,
                                        int N,
                                        int64_t limit) {
  int64_t acc;

  __m128i v_sign_d;
  __m128i v_acc0_d = _mm_setzero_si128();
  __m128i v_acc1_d = _mm_setzero_si128();
  __m128i v_acc_q;

  // Input size limited to 8192 by the use of 32 bit accumulators and m
  // being between [0, 64]. Overflow might happen at larger sizes,
  // though it is practically impossible on real video input.
  assert(N < 8192);
  assert(N % 64 == 0);

  do {
    const __m128i v_m01_b = xx_load_128(m);
    const __m128i v_m23_b = xx_load_128(m + 16);
    const __m128i v_m45_b = xx_load_128(m + 32);
    const __m128i v_m67_b = xx_load_128(m + 48);

    const __m128i v_d0_w = xx_load_128(ds);
    const __m128i v_d1_w = xx_load_128(ds + 8);
    const __m128i v_d2_w = xx_load_128(ds + 16);
    const __m128i v_d3_w = xx_load_128(ds + 24);
    const __m128i v_d4_w = xx_load_128(ds + 32);
    const __m128i v_d5_w = xx_load_128(ds + 40);
    const __m128i v_d6_w = xx_load_128(ds + 48);
    const __m128i v_d7_w = xx_load_128(ds + 56);

    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());

    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);

    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);

    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);

    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);

    ds += 64;
    m += 64;

    N -= 64;
  } while (N);

  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));

  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));

  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  xx_storel_64(&acc, v_acc_q);
#endif

  return acc > limit;
}
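The 8192 cap matches a worst-case bound: each _mm_madd_epi16 lane adds two products of at most 32767 * 64 (about 2^21), four madd results are folded into each 32-bit accumulator lane per iteration (about 2^24), and the loop runs N/64 times, so a lane can reach roughly (N/64) * 2^24, which only stays within signed 32 bits while N < 8192.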
// Negate under mask
static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
  return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
}
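negm_epi16 relies on the two's-complement identity (v XOR 0xffff) - 0xffff == -v, and is a no-op where the mask word is 0. Negating one 16-bit lane of each 32-bit pair this way lets the following _mm_madd_epi16 compute a*a - b*b directly. A scalar model of the identity, for illustration:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of negm_epi16 on one lane: mask is 0x0000 (keep the value)
 * or 0xffff (negate it via two's complement). */
static int16_t negm_scalar(int16_t v, uint16_t mask) {
  return (int16_t)(((uint16_t)v ^ mask) - mask);
}

int main(void) {
  printf("%d %d\n", negm_scalar(1234, 0x0000), negm_scalar(1234, 0xffff));
  /* prints: 1234 -1234 */
  return 0;
}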
/**
 * See vp10_wedge_compute_delta_squares_c
 */
void vp10_wedge_compute_delta_squares_sse2(int16_t *d,
                                           const int16_t *a,
                                           const int16_t *b,
                                           int N) {
  const __m128i v_neg_w = _mm_set_epi16(0xffff, 0, 0xffff, 0,
                                        0xffff, 0, 0xffff, 0);

  assert(N % 64 == 0);

  do {
    const __m128i v_a0_w = xx_load_128(a);
    const __m128i v_b0_w = xx_load_128(b);
    const __m128i v_a1_w = xx_load_128(a + 8);
    const __m128i v_b1_w = xx_load_128(b + 8);
    const __m128i v_a2_w = xx_load_128(a + 16);
    const __m128i v_b2_w = xx_load_128(b + 16);
    const __m128i v_a3_w = xx_load_128(a + 24);
    const __m128i v_b3_w = xx_load_128(b + 24);

    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);

    // Negate top word of pairs
    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);

    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);

    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);

    xx_store_128(d, v_r0_w);
    xx_store_128(d + 8, v_r1_w);
    xx_store_128(d + 16, v_r2_w);
    xx_store_128(d + 24, v_r3_w);

    a += 32;
    b += 32;
    d += 32;
    N -= 32;
  } while (N);
}
vp10/vp10cx.mk
...
@@ -124,6 +124,10 @@ endif
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
endif

ifeq ($(CONFIG_EXT_INTER),yes)
VP10_CX_SRCS-yes += encoder/wedge_utils.c
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
endif

VP10_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
...