Commit 0b721db5 authored by Dmitry Kovalev's avatar Dmitry Kovalev
Browse files

Replacing asm 8x8 variance calculation with intrinsics.

New code is 10% faster for 64-bit and 25% faster for 32-bit. Compiled
using clang.

Change-Id: I8ba1544c30dd6f3ca479db806384317549650dfc
parent 72037944
......@@ -209,193 +209,3 @@ sym(vp9_get16x16var_sse2):
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_get8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
; Computes, over an 8x8 block, the sum of pixel differences (*Sum) and the
; sum of squared differences (*SSE) between src and ref. Returns via memory
; only; the integer return register is not meaningfully set here.
; Register roles in the main body:
;   rsi/rdi = src/ref row pointers, rax/rdx = strides,
;   xmm0 = constant zero (unpack helper),
;   xmm7 = 8 x int16 running sum of diffs,
;   xmm1 = 4 x int32 running sum of squared diffs.
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
; --- row 0: load 8 bytes from each buffer, widen u8 -> i16, diff ---
; psubsw saturates, but u8-widened operands keep diffs in [-255, 255],
; so no saturation can actually occur.
movq xmm1, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
psubsw xmm1, xmm2
paddw xmm7, xmm1
pmaddwd xmm1, xmm1 ; xmm1 = 4 dwords of pairwise d*d sums (SSE accumulator)
; --- row 1 (base + stride) ---
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
; --- row 2 (base + 2*stride) ---
movq xmm2, QWORD PTR[rsi + rax * 2]
movq xmm3, QWORD PTR[rdi + rdx * 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
; advance base pointers by two rows; rows 3-4 are then base+stride and
; base+2*stride relative to the new base
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
; --- row 3 ---
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
; --- row 4 ---
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
; --- row 5 ---
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
; --- row 6 ---
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
; --- row 7 ---
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
; --- horizontal reduction ---
; Diff sums: interleave the 8 signed words of xmm7 with zero, then fold
; halves together with paddw. Negative lane sums are zero-extended (not
; sign-extended), but only the low 16 bits of the final total are kept and
; sign-extended below, and |Sum| <= 8*8*255 = 16320 fits in 16 bits, so the
; wrapped low word is the correct signed result.
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm2, xmm1
paddw xmm6, xmm7
; SSE sums: fold the 4 dwords of xmm1 down to one (interleave with zero,
; add halves, then shift-and-add).
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddw xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddw xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movq rdx, xmm7
movsx rcx, dx ; sign-extend low 16 bits of the folded diff total
mov dword ptr [rax], ecx ; *Sum
movd DWORD PTR [rdi], xmm1 ; *SSE = low dword of folded square total
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
......@@ -51,9 +51,46 @@ unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
return 0;
}
unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  // Over an 8x8 block, accumulate the sum of pixel differences (*sum) and
  // the sum of squared differences (*sse) between src and ref using SSE2.
  // Always returns 0.
  const __m128i zero = _mm_setzero_si128();
  __m128i diff_acc = _mm_setzero_si128();  // 8 x int16 running diff sums
  __m128i sq_acc = _mm_setzero_si128();    // 4 x int32 running squared sums
  int row;

  for (row = 0; row < 8; ++row) {
    // Load one 8-pixel row from each buffer and widen u8 -> i16.
    const __m128i s = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + row * src_stride)), zero);
    const __m128i r = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + row * ref_stride)), zero);
    const __m128i d = _mm_sub_epi16(s, r);  // per-pixel diff in [-255, 255]
    diff_acc = _mm_add_epi16(diff_acc, d);
    // madd yields pairwise d*d totals as four 32-bit lanes.
    sq_acc = _mm_add_epi32(sq_acc, _mm_madd_epi16(d, d));
  }

  // Fold the eight 16-bit diff sums down to one word. |sum| <= 8*8*255
  // = 16320, so the int16 result cannot overflow.
  diff_acc = _mm_add_epi16(diff_acc, _mm_srli_si128(diff_acc, 8));
  diff_acc = _mm_add_epi16(diff_acc, _mm_srli_si128(diff_acc, 4));
  diff_acc = _mm_add_epi16(diff_acc, _mm_srli_si128(diff_acc, 2));
  *sum = (int16_t)_mm_extract_epi16(diff_acc, 0);

  // Fold the four 32-bit squared sums down to one dword.
  sq_acc = _mm_add_epi32(sq_acc, _mm_srli_si128(sq_acc, 8));
  sq_acc = _mm_add_epi32(sq_acc, _mm_srli_si128(sq_acc, 4));
  *sse = _mm_cvtsi128_si32(sq_acc);

  return 0;
}
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
......@@ -110,8 +147,7 @@ unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_get8x8var_sse2, 8);
vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment