Xiph.Org / aom-rav1e · Commits

Commit 57cae22c
Authored Nov 05, 2015 by Yunqing Wang; committed by Gerrit Code Review, Nov 05, 2015
Merge "Add AVX vectorized vp9_diamond_search_sad"
Parents: c6641709 f1342a7b
Showing 6 changed files with 353 additions and 20 deletions (+353 / -20):
vp9/common/vp9_rtcd_defs.pl                     +1    -1
vp9/encoder/vp9_encoder.c                       +30   -0
vp9/encoder/vp9_mcomp.c                         +10   -16
vp9/encoder/vp9_mcomp.h                         +3    -3
vp9/encoder/x86/vp9_diamond_search_sad_avx.c    +308  -0
vp9/vp9cx.mk                                    +1    -0
vp9/common/vp9_rtcd_defs.pl  (view file @ 57cae22c)
...
...
@@ -312,7 +312,7 @@ $vp9_full_search_sad_sse3=vp9_full_search_sadx3;
$vp9_full_search_sad_sse4_1 = vp9_full_search_sadx8;

add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad/;
+specialize qw/vp9_diamond_search_sad avx/;

add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_full_range_search/;
...
...
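The changed `specialize` line is what wires the new kernel into the encoder's runtime CPU dispatch. At build time, libvpx's RTCD scripts expand these Perl declarations into a generated `vp9_rtcd.h`; a specialized symbol becomes a function pointer that is retargeted when the CPU reports the matching extension. The generated header is machine-written, so treat the following as an illustrative sketch of its shape only (the `DIAMOND_SEARCH_ARGS` macro is mine, for brevity; `RTCD_EXTERN`, `x86_simd_caps()` and `HAS_AVX` come from libvpx's RTCD framework and vpx_ports/x86.h):

#define DIAMOND_SEARCH_ARGS                                           \
  const struct macroblock *x, const struct search_site_config *cfg,  \
  struct mv *ref_mv, struct mv *best_mv, int search_param,           \
  int sad_per_bit, int *num00,                                       \
  const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv

int vp9_diamond_search_sad_c(DIAMOND_SEARCH_ARGS);
int vp9_diamond_search_sad_avx(DIAMOND_SEARCH_ARGS);
RTCD_EXTERN int (*vp9_diamond_search_sad)(DIAMOND_SEARCH_ARGS);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();
  vp9_diamond_search_sad = vp9_diamond_search_sad_c;  /* safe default */
  if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
}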
vp9/encoder/vp9_encoder.c  (view file @ 57cae22c)
...
...
@@ -1570,7 +1570,30 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#endif
#define log2f(x) (log (x) / (float) M_LOG2_E)
/***********************************************************************
* Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' *
***********************************************************************
* The following 2 functions ('cal_nmvjointsadcost' and *
* 'cal_nmvsadcosts') are used to calculate cost lookup tables *
* used by 'vp9_diamond_search_sad'. The C implementation of the *
* function is generic, but the AVX intrinsics optimised version *
* relies on the following properties of the computed tables: *
* For cal_nmvjointsadcost: *
* - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
* For cal_nmvsadcosts: *
* - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
* (Equal costs for both components) *
* - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
* (Cost function is even) *
* If these do not hold, then the AVX optimised version of the *
* 'vp9_diamond_search_sad' function cannot be used as it is, in which *
* case you can revert to using the C function instead. *
***********************************************************************/
static void cal_nmvjointsadcost(int *mvjointsadcost) {
  /*********************************************************************
   * Warning: Read the comments above before modifying this function   *
   *********************************************************************/
  mvjointsadcost[0] = 600;
  mvjointsadcost[1] = 300;
  mvjointsadcost[2] = 300;
...
...
@@ -1578,6 +1601,9 @@ static void cal_nmvjointsadcost(int *mvjointsadcost) {
}
static void cal_nmvsadcosts(int *mvsadcost[2]) {
  /*********************************************************************
   * Warning: Read the comments above before modifying this function   *
   *********************************************************************/
  int i = 1;

  mvsadcost[0][0] = 0;
...
...
@@ -1739,6 +1765,10 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
  cpi->first_time_stamp_ever = INT64_MAX;

  /*********************************************************************
   * Warning: Read the comments around 'cal_nmvjointsadcost' and       *
   * 'cal_nmvsadcosts' before modifying how these tables are computed. *
   *********************************************************************/
  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
...
...
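Because the AVX kernel silently depends on the invariants documented above, they are cheap to assert in a debug build. Below is a minimal sketch, not part of the commit: `check_sad_cost_tables` is a hypothetical helper, and it assumes the component pointers are biased to the middle of their tables (as the `&cpi->nmvcosts[0][MV_MAX]` assignments above do), so that negative indices are valid.

#include <assert.h>

/* Hypothetical debug helper: verify the table properties that
 * vp9_diamond_search_sad_avx relies on. */
static void check_sad_cost_tables(const int *mvjointsadcost,
                                  int *const mvsadcost[2]) {
  int i;
  /* All non-zero joint types must share one cost. */
  assert(mvjointsadcost[1] == mvjointsadcost[2]);
  assert(mvjointsadcost[1] == mvjointsadcost[3]);
  for (i = 0; i <= MV_MAX; ++i) {
    assert(mvsadcost[0][i] == mvsadcost[1][i]);   /* equal components */
    assert(mvsadcost[0][i] == mvsadcost[0][-i]);  /* even function */
  }
}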
vp9/encoder/vp9_mcomp.c  (view file @ 57cae22c)
...
...
@@ -101,11 +101,8 @@ static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
}
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss_mv[0].col = 0;
-  cfg->ss_mv[0].row = 0;
-  cfg->ss_os[0] = 0;
+  int len;
+  int ss_count = 0;

  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
    // Generate offsets for 4 search sites per step.
...
...
@@ -117,16 +114,13 @@ void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
}
}
-  cfg->ss_count = ss_count;
  cfg->searches_per_step = 4;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
}
void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss_mv[0].col = 0;
-  cfg->ss_mv[0].row = 0;
-  cfg->ss_os[0] = 0;
+  int len;
+  int ss_count = 0;

  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
    // Generate offsets for 8 search sites per step.
...
...
@@ -141,8 +135,8 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
}
}
-  cfg->ss_count = ss_count;
  cfg->searches_per_step = 8;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
}
/*
...
...
@@ -1612,8 +1606,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
  const uint8_t *best_address;

  unsigned int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
+  int best_site = -1;
+  int last_site = -1;

  int ref_row;
  int ref_col;
...
...
@@ -1626,7 +1620,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
// const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const int tot_steps = (cfg->total_steps) - search_param;

  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
...
...
@@ -1644,7 +1638,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
            mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);

-  i = 1;
+  i = 0;

  for (step = 0; step < tot_steps; step++) {
    int all_in = 1, t;
...
...
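The init changes above are what make the vector path possible: the old configs reserved slot 0 for a null search site (hence `ss_count = 1` and the C search starting at `i = 1`), while the new layout drops the sentinel so the sites pack into aligned groups of `searches_per_step`, and the repeated `ss_count / searches_per_step` division is precomputed once as `total_steps`. The step arithmetic itself is simple; a stand-alone sketch, assuming `MAX_FIRST_STEP` is 1024 (i.e. `1 << (MAX_MVSEARCH_STEPS - 1)` with `MAX_MVSEARCH_STEPS` of 11 — treat both constants as assumptions here):

#include <stdio.h>

#define MAX_FIRST_STEP 1024 /* assumed value of the vp9_mcomp.h constant */

int main(void) {
  int len, num_lengths = 0;
  /* One pass per power-of-two step length, as in the init loops above. */
  for (len = MAX_FIRST_STEP; len > 0; len /= 2) ++num_lengths;
  /* vp9_init_dsmotion_compensation emits 4 sites per length,
   * vp9_init3smotion_compensation emits 8; the division is always exact. */
  printf("lengths=%d  ds: ss_count=%d total_steps=%d  3s: ss_count=%d\n",
         num_lengths, 4 * num_lengths, 4 * num_lengths / 4, 8 * num_lengths);
  return 0;
}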
vp9/encoder/vp9_mcomp.h  (view file @ 57cae22c)
...
...
@@ -33,10 +33,10 @@ extern "C" {
typedef struct search_site_config {
  // motion search sites
-  MV  ss_mv[8 * MAX_MVSEARCH_STEPS + 1];       // Motion vector
-  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS + 1];  // Offset
-  int ss_count;
+  MV  ss_mv[8 * MAX_MVSEARCH_STEPS];           // Motion vector
+  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS];      // Offset
  int searches_per_step;
+  int total_steps;
} search_site_config;

void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
...
...
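Dropping the `+ 1` sentinel also restores a convenient shape: with `MV` being two 16-bit components, four consecutive sites span exactly one 128-bit register, which is what lets the new kernel fetch a whole group of candidates per load. A minimal layout sketch (the local `MV` typedef mirrors vp9_mv.h; the real code uses aligned `_mm_load_si128`, which additionally requires the arrays to sit on 16-byte boundaries):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct mv { int16_t row, col; } MV; /* mirrors vp9_mv.h */

int main(void) {
  const MV sites[4] = { { -8, 0 }, { 8, 0 }, { 0, -8 }, { 0, 8 } };
  const MV centre = { 2, 3 };
  int32_t centre_bits;
  MV cand[4];
  int i;

  /* Broadcast the centre MV into every 32-bit lane... */
  memcpy(&centre_bits, &centre, sizeof(centre_bits));
  const __m128i v_centre = _mm_set1_epi32(centre_bits);
  /* ...then add four site offsets at once: 4 MVs * 4 bytes == 16 bytes. */
  const __m128i v_sites = _mm_loadu_si128((const __m128i *)sites);
  const __m128i v_cand = _mm_add_epi16(v_centre, v_sites);
  _mm_storeu_si128((__m128i *)cand, v_cand);
  for (i = 0; i < 4; ++i)
    printf("candidate %d: row=%d col=%d\n", i, cand[i].row, cand[i].col);
  return 0;
}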
vp9/encoder/x86/vp9_diamond_search_sad_avx.c  (new file, 0 → 100644; view file @ 57cae22c)
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h>
#include <smmintrin.h>
#include "vpx_dsp/vpx_dsp_common.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vpx_ports/mem.h"
#ifdef __GNUC__
# define __likely__(v) __builtin_expect(v, 1)
# define __unlikely__(v) __builtin_expect(v, 0)
#else
# define __likely__(v) (v)
# define __unlikely__(v) (v)
#endif
static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
  // This is simplified from the C implementation to utilise that
  // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
  // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
  return mv.as_int == 0 ? 0 : 1;
}

static INLINE int mv_cost(const int_mv mv,
                          const int *joint_cost, int *const comp_cost[2]) {
  return joint_cost[get_mv_joint(mv)] +
         comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
}

static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
                          int error_per_bit) {
  const int_mv diff = { .as_mv = { mv.as_mv.row - ref->row,
                                   mv.as_mv.col - ref->col } };
  return ROUND_POWER_OF_TWO(mv_cost(diff, x->nmvjointsadcost,
                                    x->nmvsadcost) * error_per_bit, 8);
}
/*****************************************************************************
* This function utilises 3 properties of the cost function lookup tables, *
* constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
* vp9_encoder.c. *
* For the joint cost: *
* - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
* For the component costs: *
* - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
* (Equal costs for both components) *
* - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
* (Cost function is even) *
* If these do not hold, then this function cannot be used without *
* modification, in which case you can revert to using the C implementation, *
* which does not rely on these properties. *
*****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = { .as_mv = { x->mv_row_max, x->mv_col_max } };
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = { .as_mv = { x->mv_row_min, x->mv_col_min } };
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = { .as_mv = { center_mv->row >> 3,
                                         center_mv->col >> 3 } };
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = { .as_mv = { ref_row, ref_col } };
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_load_si128((const __m128i *)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets (could use _mm_maskload_ps here, instead of the
        // extra 'and' but it's slower that way)
        __m128i v_bo10_q = _mm_load_si128((const __m128i *)&ss_os[i + 0]);
        __m128i v_bo32_q = _mm_load_si128((const __m128i *)&ss_os[i + 2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86  //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_load_si128((const __m128i *)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t *)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d;
        __m128i v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d =
            _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
        const __m128i v_joint_cost_d =
            _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);

      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);

      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);

          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
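Two details of the reduction above are worth isolating. The SADs are 32-bit, but SSE4.1 only offers a horizontal minimum for unsigned 16-bit lanes (`_mm_minpos_epu16`), so the kernel packs with unsigned saturation and speculates that the minimum fits in 16 bits, falling back to a manual 32-bit reduction only when the packed minimum saturates to 0xffff. A stand-alone sketch of the fast path (SSE4.1; compile with e.g. -msse4.1):

#include <smmintrin.h>  /* SSE4.1 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Lane order, low to high: 300, 65600, 120, 700; 65600 saturates. */
  const __m128i v_sad_d = _mm_set_epi32(700, 120, 65600, 300);
  const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
  const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
  const uint32_t best = _mm_extract_epi16(v_minp_w, 0); /* minimum value */
  const uint32_t lane = _mm_extract_epi16(v_minp_w, 1); /* its lane index */
  /* best == 0xffff means "possibly saturated": redo the min on 32 bits. */
  printf("min=%u lane=%u saturated=%d\n", best, lane, best == 0xffff);
  return 0;
}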
vp9/vp9cx.mk  (view file @ 57cae22c)
...
...
@@ -96,6 +96,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
...
...