Commit 3bf7b131 authored by Yunqing Wang, committed by Gerrit Code Review

Merge "Improve sad3x16 SSE2 function" into experimental

parents fbc8e8f9 e7cd8071
@@ -45,15 +45,13 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
 unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
                            int src_stride,
                            const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
+                           int ref_stride) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
 }
 
 unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
                            int src_stride,
                            const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
+                           int ref_stride) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
 }
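
For context, sad_mx_n_c is the generic scalar fallback these two wrappers call. A minimal sketch of what an m-wide, n-tall sum-of-absolute-differences kernel computes (hypothetical name, not the library's exact implementation) is:

#include <stdlib.h>  /* abs() */

/* Sketch of a generic m x n sum-of-absolute-differences kernel. */
static unsigned int sad_mx_n_sketch(const unsigned char *src_ptr, int src_stride,
                                    const unsigned char *ref_ptr, int ref_stride,
                                    int m, int n) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < n; r++) {
    for (c = 0; c < m; c++)
      sad += abs(src_ptr[c] - ref_ptr[c]);
    src_ptr += src_stride;  /* advance both pointers one row */
    ref_ptr += ref_stride;
  }
  return sad;
}

The unused max_sad early-exit parameter is what this commit removes from the 16x3 and 3x16 variants.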
@@ -230,23 +228,23 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
     score = 0;
     if (xd->up_available) {
       score += vp9_sad16x3(above_src, xd->dst.y_stride,
-                           above_ref + offset, ref_y_stride, INT_MAX);
+                           above_ref + offset, ref_y_stride);
 #if CONFIG_SUPERBLOCKS
       if (xd->mode_info_context->mbmi.encoded_as_sb) {
         score += vp9_sad16x3(above_src + 16, xd->dst.y_stride,
-                             above_ref + offset + 16, ref_y_stride, INT_MAX);
+                             above_ref + offset + 16, ref_y_stride);
       }
 #endif
     }
     if (xd->left_available) {
       score += vp9_sad3x16(left_src, xd->dst.y_stride,
-                           left_ref + offset, ref_y_stride, INT_MAX);
+                           left_ref + offset, ref_y_stride);
 #if CONFIG_SUPERBLOCKS
       if (xd->mode_info_context->mbmi.encoded_as_sb) {
         score += vp9_sad3x16(left_src + xd->dst.y_stride * 16,
                              xd->dst.y_stride,
                              left_ref + offset + ref_y_stride * 16,
-                             ref_y_stride, INT_MAX);
+                             ref_y_stride);
       }
 #endif
     }
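
The hunk above scores each candidate reference MV by how well the already-reconstructed pixels just above (16x3) and to the left (3x16) of the current block match the reference frame at the candidate's offset; a lower SAD means a better candidate. A simplified sketch of that idea (hypothetical helper name and parameters, collapsed from the code above) might look like:

/* Hypothetical sketch: combine the above-strip and left-strip SADs into one
 * boundary-match score for a candidate reference MV. */
static unsigned int boundary_score_sketch(const unsigned char *above_src,
                                          const unsigned char *left_src,
                                          const unsigned char *above_ref,
                                          const unsigned char *left_ref,
                                          int src_stride, int ref_stride,
                                          int have_above, int have_left) {
  unsigned int score = 0;
  if (have_above)   /* 16-wide, 3-tall strip above the block */
    score += vp9_sad16x3(above_src, src_stride, above_ref, ref_stride);
  if (have_left)    /* 3-wide, 16-tall strip left of the block */
    score += vp9_sad3x16(left_src, src_stride, left_ref, ref_stride);
  return score;
}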
@@ -221,10 +221,10 @@ vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
 #
 # sad 16x3, 3x16
 #
-prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"
 specialize vp9_sad16x3 sse2
 
-prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
 #
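
In the rtcd definitions, "prototype" declares the dispatched function's signature and "specialize" lists the architecture-specific variants; dropping max_sad here keeps the generated vp9_rtcd.h declaration in sync with the C and SSE2 bodies changed in this commit. Roughly, and only as an illustrative sketch (the real header is produced by the rtcd scripts and may use function pointers when runtime CPU detection is enabled), the dispatch amounts to something like:

/* Illustrative sketch of the generated dispatch; not the actual vp9_rtcd.h. */
unsigned int vp9_sad16x3_c(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride);
unsigned int vp9_sad16x3_sse2(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride);
#if HAVE_SSE2
#define vp9_sad16x3 vp9_sad16x3_sse2
#else
#define vp9_sad16x3 vp9_sad16x3_c
#endif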
@@ -11,21 +11,18 @@
 #include <emmintrin.h>  // SSE2
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
 
 #if HAVE_SSE2
 unsigned int vp9_sad16x3_sse2(
   const unsigned char *src_ptr,
   int src_stride,
   const unsigned char *ref_ptr,
-  int ref_stride,
-  int max_sad) {
+  int ref_stride) {
   __m128i s0, s1, s2;
   __m128i r0, r1, r2;
   __m128i sad;
 
-  (void)max_sad;
-
   s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
   s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
   s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
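
The 16x3 kernel is built on _mm_sad_epu8 (PSADBW), which for each 16-byte row produces two partial sums, one in the low 16 bits of each 64-bit lane. A minimal sketch of one row plus the final horizontal reduction (hypothetical helper name) could be:

#include <emmintrin.h>  /* SSE2 */

/* Sketch: SAD of one 16-byte row, then fold the two 64-bit partial sums
 * that _mm_sad_epu8 leaves in the low and high halves of the register. */
static unsigned int sad16x1_sketch(const unsigned char *src,
                                   const unsigned char *ref) {
  __m128i s = _mm_loadu_si128((const __m128i *)src);
  __m128i r = _mm_loadu_si128((const __m128i *)ref);
  __m128i d = _mm_sad_epu8(s, r);              /* two 16-bit sums, one per 64-bit lane */
  d = _mm_add_epi32(d, _mm_srli_si128(d, 8));  /* fold high lane onto low lane */
  return (unsigned int)_mm_cvtsi128_si32(d);   /* low 32 bits hold the total */
}

The kernel above presumably accumulates three such PSADBW results, one per row, into the sad register before performing a reduction of this kind.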
@@ -46,12 +43,25 @@ unsigned int vp9_sad3x16_sse2(
   const unsigned char *src_ptr,
   int src_stride,
   const unsigned char *ref_ptr,
-  int ref_stride,
-  int max_sad) {
+  int ref_stride) {
   int r;
   __m128i s0, s1, s2, s3;
   __m128i r0, r1, r2, r3;
-  __m128i sad = _mm_set1_epi16(0);
+  __m128i sad = _mm_setzero_si128();
+  __m128i mask;
+  const int offset = (uintptr_t)src_ptr & 3;
+
+  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
+   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
+   * takes much less time.
+   */
+  if (offset == 1)
+    src_ptr -= 1;
+
+  /* mask = 0xffffffffffff0000ffffffffffff0000 */
+  mask = _mm_cmpeq_epi32(sad, sad);
+  mask = _mm_slli_epi64(mask, 16);
 
   for (r = 0; r < 16; r += 4) {
     s0 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 0 * src_stride));
     s1 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 1 * src_stride));
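
The mask is built without a memory constant: comparing a register with itself via _mm_cmpeq_epi32 yields all ones, and _mm_slli_epi64(..., 16) then clears the low 16 bits of each 64-bit lane, giving 0xffffffffffff0000 per lane as the comment says. A tiny standalone check (hypothetical, just to visualize the byte pattern):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char bytes[16];
  int i;
  __m128i zero = _mm_setzero_si128();
  __m128i mask = _mm_cmpeq_epi32(zero, zero);  /* all bits set */
  mask = _mm_slli_epi64(mask, 16);             /* zero the low 2 bytes of each 64-bit lane */
  _mm_storeu_si128((__m128i *)bytes, mask);
  for (i = 0; i < 16; i++)
    printf("%02x ", bytes[i]);                 /* 00 00 ff ff ff ff ff ff 00 00 ff ff ff ff ff ff */
  printf("\n");
  return 0;
}

When src_ptr sits one byte past a 4-byte boundary, the code reads from src_ptr - 1 so the 32-bit movd loads stay aligned (per the comment in the hunk above); the extra leading byte that this pulls in is then cleared by this mask instead of being shifted out, as the next hunk shows.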
@@ -69,8 +79,11 @@ unsigned int vp9_sad3x16_sse2(
     s0 = _mm_unpacklo_epi64(s0, s2);
     r0 = _mm_unpacklo_epi64(r0, r2);
 
-    // throw out byte 3
-    s0 = _mm_slli_epi64(s0, 16);
+    // throw out extra byte
+    if (offset == 1)
+      s0 = _mm_and_si128(s0, mask);
+    else
+      s0 = _mm_slli_epi64(s0, 16);
     r0 = _mm_slli_epi64(r0, 16);
 
     sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
@@ -84,5 +97,3 @@ unsigned int vp9_sad3x16_sse2(
 }
 #endif
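
Putting the pieces together, the essence of the 3-wide case is: per row, load four bytes (one byte past the block), discard the byte that falls outside the block identically in source and reference, and let _mm_sad_epu8 do the adding. A deliberately simplified, row-at-a-time sketch of that idea (hypothetical name; the real kernel packs four rows per iteration and handles the unaligned-by-one source with the shifted mask shown earlier):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <string.h>

/* Simplified sketch of the 3x16 idea: read 4 bytes per row, keep only the
 * 3 bytes inside the block in both operands, and accumulate PSADBW results. */
static unsigned int sad3x16_sketch(const unsigned char *src_ptr, int src_stride,
                                   const unsigned char *ref_ptr, int ref_stride) {
  const __m128i keep3 = _mm_cvtsi32_si128(0x00ffffff);  /* keep bytes 0..2 only */
  __m128i sum = _mm_setzero_si128();
  int r;
  for (r = 0; r < 16; r++) {
    int32_t sw, rw;
    __m128i s, t;
    memcpy(&sw, src_ptr + r * src_stride, 4);   /* unaligned-safe 4-byte load */
    memcpy(&rw, ref_ptr + r * ref_stride, 4);
    s = _mm_and_si128(_mm_cvtsi32_si128(sw), keep3);
    t = _mm_and_si128(_mm_cvtsi32_si128(rw), keep3);
    sum = _mm_add_epi32(sum, _mm_sad_epu8(s, t));
  }
  return (unsigned int)_mm_cvtsi128_si32(sum);  /* all data sits in the low 64-bit lane */
}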