Commit 085433c2 authored by Scott LaVarnway

sse2 intrinsic version of vp8_mbloop_filter_vertical_edge()

First sse2 version of vp8_mbloop_filter_vertical_edge().  For now,
intrinsics are being used until the bitstream is finalized.  This function
will be revisited later for further performance improvements.

For the test clip used, a 34+% decoder performance improvement
was seen.  This will vary depending on material.

Change-Id: I455b438bc8d8af76cf7533ac42eda5f689b21f7c
parent 992b5e2d
......@@ -271,7 +271,6 @@ void vp8_mbloop_filter_horizontal_edge_c
} while (++i < count * 8);
}
void vp8_mbloop_filter_vertical_edge_c
(
unsigned char *s,
......
......@@ -125,13 +125,13 @@ specialize vp8_comp_intra_uv4x4_predict;
# Loopfilter
#
prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_mbv;
specialize vp8_loop_filter_mbv sse2
prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_bv;
prototype void vp8_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_bv8x8;
specialize vp8_loop_filter_bv8x8 sse2
prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_mbh sse2
......
This diff is collapsed.
......@@ -380,193 +380,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
ret
; MB_FILTER_AND_WRITEBACK <mode>
;
; Applies the macroblock loop filter to the six pixels p2..q2 around an edge
; and stores the filtered pixels. <mode> (%1) selects where pixels are read
; from and written to:
;   0 - p1/q1/p2/q2 are read through the p1, q1, p2, q2 symbols (defined
;       outside this macro); results are stored as 8-byte halves through
;       rsi/rdi using rax and rcx, which the caller must have set up.
;   1 - rows are read and written as aligned 16-byte vectors relative to
;       rsi/rdi with rax as the row offset; rcx is set to -rax here.
;   2 - rows live in the srct scratch buffer (rdx is loaded with its
;       address); only q1, q0, p0 and p1 are stored back. p2 is left in
;       xmm7 and q2 in xmm5 for the code that follows.
;
; On entry (per the inline comments below): xmm1 = filter mask, xmm4 = hev
; mask and, for modes 0 and 1, xmm6 = p0 and xmm0 = q0.
; All of xmm0-xmm7 are overwritten. Constants t80, t3, t4, s9 and s63 are
; read through GLOBAL().
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
movdqa xmm2, p1 ; p1
movdqa xmm7, q1 ; q1
%elif %1 == 1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
mov rcx, rax
neg rcx ; rcx = -rax
%elif %1 == 2
lea rdx, srct ; rdx = base of the scratch buffer
movdqa xmm2, [rdx+32] ; p1
movdqa xmm7, [rdx+80] ; q1
movdqa xmm6, [rdx+48] ; p0
movdqa xmm0, [rdx+64] ; q0
%endif
pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
pxor xmm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
pxor xmm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
paddsb xmm2, xmm0 ; 2 * (q0 - p0) + (p1 - q1)
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
pand xmm1, xmm2 ; mask filter values we don't care about
movdqa xmm2, xmm1 ; vp8_filter
pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
pxor xmm0, xmm0
pandn xmm4, xmm1 ; vp8_filter&=~hev
pxor xmm1, xmm1
; Unpacking against a zeroed register places each filter byte in the high
; byte of a 16-bit word.
punpcklbw xmm0, xmm4 ; Filter 2 (hi)
movdqa xmm5, xmm2
punpckhbw xmm1, xmm4 ; Filter 2 (lo)
paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9
pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9
punpckhbw xmm7, xmm5 ; axbxcxdx
paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
punpcklbw xmm5, xmm5 ; exfxgxhx
; The bytes sit in the high half of each word, so an arithmetic shift by
; 11 (8 + 3) yields the sign-extended value >> 3.
psraw xmm7, 11 ; sign extended shift right by 3
psraw xmm5, 11 ; sign extended shift right by 3
punpckhbw xmm4, xmm2 ; axbxcxdx
punpcklbw xmm2, xmm2 ; exfxgxhx
psraw xmm4, 11 ; sign extended shift right by 3
packsswb xmm5, xmm7 ; Filter2 >>=3;
psraw xmm2, 11 ; sign extended shift right by 3
packsswb xmm2, xmm4 ; Filter1 >>=3;
movdqa xmm7, xmm1
paddsb xmm6, xmm5 ; ps0 =ps0 + Filter2
movdqa xmm4, xmm1
psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
movdqa xmm5, xmm0
movdqa xmm2, xmm5
paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63
paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63
paddw xmm5, xmm5 ; Filter 2 (hi) * 18
paddw xmm7, xmm7 ; Filter 2 (lo) * 18
paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
; Reload the outer pixels (q2, q1, p1, p2) from the source selected by %1.
%if %1 == 0
movdqa xmm5, q2 ; q2
movdqa xmm1, q1 ; q1
movdqa xmm4, p1 ; p1
movdqa xmm7, p2 ; p2
%elif %1 == 1
movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2
movdqa xmm1, XMMWORD PTR [rdi] ; q1
movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1
movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
movdqa xmm5, XMMWORD PTR [rdx+96] ; q2
movdqa xmm1, XMMWORD PTR [rdx+80] ; q1
movdqa xmm4, XMMWORD PTR [rdx+32] ; p1
movdqa xmm7, XMMWORD PTR [rdx+16] ; p2
%endif
pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80
pxor xmm6, [GLOBAL(t80)] ; *op0 = sp^0x80
pxor xmm1, [GLOBAL(t80)] ; q1 to signed
pxor xmm4, [GLOBAL(t80)] ; p1 to signed
psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80;
pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80;
pxor xmm7, [GLOBAL(t80)] ; p2 to signed
pxor xmm5, [GLOBAL(t80)] ; q2 to signed
paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u)
psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80;
pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80;
; Write back: low 8 bytes of each row through rsi, high 8 bytes through rdi
; (mode 0); full 16-byte rows (mode 1); or the scratch buffer (mode 2).
%if %1 == 0
lea rsi, [rsi+rcx*2]
lea rdi, [rdi+rcx*2]
movq MMWORD PTR [rsi], xmm6 ; p0
movhps MMWORD PTR [rdi], xmm6
movq MMWORD PTR [rsi + rcx], xmm3 ; q0
movhps MMWORD PTR [rdi + rcx], xmm3
movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1
movhps MMWORD PTR [rdi+rcx*2], xmm1
movq MMWORD PTR [rsi + rax], xmm4 ; p1
movhps MMWORD PTR [rdi + rax], xmm4
movq MMWORD PTR [rsi+rax*2], xmm7 ; p2
movhps MMWORD PTR [rdi+rax*2], xmm7
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2
movhps MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2
movdqa XMMWORD PTR [rdi], xmm1 ; q1
movdqa XMMWORD PTR [rsi], xmm3 ; q0
movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0
movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1
movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2
%elif %1 == 2
; p2 (xmm7) and q2 (xmm5) are intentionally not stored in this mode.
movdqa XMMWORD PTR [rdx+80], xmm1 ; q1
movdqa XMMWORD PTR [rdx+64], xmm3 ; q0
movdqa XMMWORD PTR [rdx+48], xmm6 ; p0
movdqa XMMWORD PTR [rdx+32], xmm4 ; p1
%endif
%endmacro
%macro TRANSPOSE_16X8 2
movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
......@@ -1032,233 +845,6 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
pop rbp
ret
; MBV_TRANSPOSE
;
; First stage of transposing the filtered columns held in the scratch
; buffer at rdx back into pixel rows. In the lane comments the first hex
; digit is the row (0-f) and the second the column (0-7).
; Inputs: columns 0, 2, 3, 4, 5 and 7 are read from [rdx], [rdx+32],
; [rdx+48], [rdx+64], [rdx+80] and [rdx+112]; column 1 is taken from xmm7
; and column 6 from xmm5 (per the lane comments).
; Outputs (per the lane comments): xmm0 = rows 0-1, xmm6 = rows 2-3,
; xmm3/xmm7 = the two halves of rows 4-7, xmm1/xmm4 = columns 0-3 of rows
; 8-b/c-f. xmm5 is read but not modified.
%macro MBV_TRANSPOSE 0
movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
; MBV_WRITEBACK_1
;
; Stores transposed rows 0-7 (8 bytes each) and prepares rows 8-f for
; MBV_WRITEBACK_2. Even rows are addressed through rsi and odd rows through
; rdi with offsets 0, 2*rax, 4*rax and 2*rcx; the callers in this file set
; rdi = rsi + rax and rcx = 3*rax before invoking it.
; Inputs: xmm0, xmm6, xmm3, xmm7, xmm1 as left by MBV_TRANSPOSE, xmm5 as the
; untransposed column 6, and columns 4, 5, 7 in [rdx+64], [rdx+80],
; [rdx+112].
; Outputs (per the lane comments): xmm1 = rows 8-9, xmm5 = rows a-b,
; xmm2 = columns 4-7 of rows c-f. xmm4 is left untouched.
%macro MBV_WRITEBACK_1 0
movq QWORD PTR [rsi], xmm0 ; row 0
movhps MMWORD PTR [rdi], xmm0 ; row 1
movq QWORD PTR [rsi+2*rax], xmm6 ; row 2
movhps MMWORD PTR [rdi+2*rax], xmm6 ; row 3
movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
movq QWORD PTR [rsi+4*rax], xmm0 ; row 4
movhps MMWORD PTR [rdi+4*rax], xmm0 ; row 5
movq QWORD PTR [rsi+2*rcx], xmm3 ; row 6
movhps MMWORD PTR [rdi+2*rcx], xmm3 ; row 7
; Build columns 4-7 of rows 8-f from the upper halves of the column vectors.
movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
movdqa xmm0, xmm2
punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
; MBV_WRITEBACK_2
;
; Stores transposed rows 8-f (8 bytes each) using the same addressing scheme
; as MBV_WRITEBACK_1: even rows through rsi, odd rows through rdi, with
; offsets 0, 2*rax, 4*rax and 2*rcx.
; Inputs: xmm1 = rows 8-9, xmm5 = rows a-b, xmm4 = columns 0-3 of rows c-f,
; xmm2 = columns 4-7 of rows c-f (per the lane comments).
%macro MBV_WRITEBACK_2 0
movq QWORD PTR [rsi], xmm1 ; row 8
movhps MMWORD PTR [rdi], xmm1 ; row 9
movq QWORD PTR [rsi+2*rax], xmm5 ; row a
movhps MMWORD PTR [rdi+2*rax], xmm5 ; row b
movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
punpckhdq xmm4, xmm2 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
movq QWORD PTR [rsi+4*rax], xmm1 ; row c
movhps MMWORD PTR [rdi+4*rax], xmm1 ; row d
movq QWORD PTR [rsi+2*rcx], xmm4 ; row e
movhps MMWORD PTR [rdi+2*rcx], xmm4 ; row f
%endmacro
;void vp8_mbloop_filter_vertical_edge_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
;)
;
; Macroblock loop filter across a vertical edge, 16 rows at a time.
; Flow: transpose the pixels around the edge into the srct scratch buffer,
; compute the filter and hev masks, run the macroblock filter on the
; transposed data (mode 2), then transpose back and store 8 bytes per row.
; Registers: rsi = src_ptr - 4 (row 0), rdi = rsi + stride (row 1),
; rax = stride, rcx = 3 * stride.
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK,
; TRANSPOSE_16X8 and LFV_FILTER_MASK_HEV_MASK are defined outside this
; view; their register side effects should be confirmed there.
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 160 ; reserve 160 bytes
%define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
mov rsi, arg(0) ; src_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
lea rsi, [rsi - 4] ; start 4 pixels left of the edge
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax*2+rax] ; rcx = 3 * stride
; Transpose
TRANSPOSE_16X8 1, 0
; calculate filter mask and high edge variance
LFV_FILTER_MASK_HEV_MASK 0
neg rax ; rax = -stride
; start work on filters
MB_FILTER_AND_WRITEBACK 2
; Step rsi/rdi back by 8 rows; presumably TRANSPOSE_16X8 advanced them
; while loading the second half - TODO confirm against that macro.
lea rsi, [rsi+rax*8]
lea rdi, [rdi+rax*8]
; transpose and write back
MBV_TRANSPOSE
neg rax ; rax = +stride again
MBV_WRITEBACK_1 ; rows 0-7
lea rsi, [rsi+rax*8] ; advance to row 8
lea rdi, [rdi+rax*8] ; advance to row 9
MBV_WRITEBACK_2 ; rows 8-15
add rsp, 160 ; release the scratch area
pop rsp ; presumably restores the rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
; unsigned char *u,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh,
; unsigned char *v
;)
;
; Macroblock loop filter across a vertical edge for both chroma planes in
; one pass. The filtered result is written back in two halves: the first
; 8 rows go to the u plane (arg 0) and the second 8 rows to the v plane
; (arg 5), each starting 4 pixels left of the edge.
; Registers: rax = stride, rcx = 3 * stride, rdx = &srct.
; NOTE(review): TRANSPOSE_16X8 and LFV_FILTER_MASK_HEV_MASK are defined
; outside this view; presumably TRANSPOSE_16X8 0 loads 8 rows from each of
; u and v - confirm against that macro.
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 160 ; reserve 160 bytes
%define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
mov rsi, arg(0) ; u_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
lea rsi, [rsi - 4] ; start 4 pixels left of the edge
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax+2*rax] ; rcx = 3 * stride
lea rdx, srct ; rdx = base of the scratch buffer
; Transpose
TRANSPOSE_16X8 0, 0
; calculate filter mask and high edge variance
LFV_FILTER_MASK_HEV_MASK 0
; start work on filters
MB_FILTER_AND_WRITEBACK 2
; transpose and write back
MBV_TRANSPOSE
; Rows 0-7 of the transposed result go back to the u plane.
mov rsi, arg(0) ;u_ptr
lea rsi, [rsi - 4]
lea rdi, [rsi + rax]
MBV_WRITEBACK_1
; Rows 8-15 of the transposed result go back to the v plane.
mov rsi, arg(5) ;v_ptr
lea rsi, [rsi - 4]
lea rdi, [rsi + rax]
MBV_WRITEBACK_2
add rsp, 160 ; release the scratch area
pop rsp ; presumably restores the rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
......
......@@ -13,17 +13,14 @@
#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
#if HAVE_MMX
/* Horizontal MB filtering */
......@@ -35,13 +32,6 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, struct loop_filter_info *lfi) {
  /* Chroma planes in the order they are filtered: u first, then v. */
  unsigned char *chroma_planes[2];
  int plane;

  chroma_planes[0] = u_ptr;
  chroma_planes[1] = v_ptr;

  /* Luma is always filtered, with count = 2. */
  vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride,
                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);

  /* Each chroma plane is optional and filtered with count = 1. */
  for (plane = 0; plane < 2; ++plane) {
    if (chroma_planes[plane]) {
      vp8_mbloop_filter_vertical_edge_mmx(chroma_planes[plane], uv_stride,
                                          lfi->mblim, lfi->lim, lfi->hev_thr, 1);
    }
  }
}
......@@ -340,6 +330,107 @@ void vp8_mbloop_filter_horizontal_edge_c_sse2
}
}
}
/*
 * Transposes one or more independent 8x8 byte tiles.
 *
 *   src[i]                 top-left byte of the i-th input tile
 *   in_p                   byte distance between consecutive input rows
 *   dst[i]                 top-left byte of the i-th output tile
 *   out_p                  byte distance between consecutive output rows
 *   num_8x8_to_transpose   number of tiles to process
 *
 * Each tile reads 8 bytes from each of 8 input rows and writes 8 bytes to
 * each of 8 output rows, so that out[c][r] == in[r][c]. The loop is a
 * do/while, so at least one tile is always processed.
 *
 * The 64-bit halves are stored with _mm_storel_pd/_mm_storeh_pd; the integer
 * vectors are reinterpreted with _mm_castsi128_pd() rather than a C-style
 * vector cast, which is a compiler extension and not accepted by every
 * compiler.
 */
static __inline void transpose(unsigned char *src[], int in_p,
                               unsigned char *dst[], int out_p,
                               int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;

  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77

    // Interleave bytes of adjacent row pairs.
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // Output rows 0-3 come from the low halves of the interleaved pairs.
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // Output rows 4-7 come from the high halves of the interleaved pairs.
    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
/*
 * Vertical-edge macroblock loop filter built on top of the horizontal one:
 * the pixels around the edge are transposed into a 16x16 scratch buffer,
 * filtered there as a horizontal edge, and the rows that can have been
 * modified are transposed back into the frame.
 *
 * The window starts 5 pixels left of the edge so that the edge itself lands
 * on scratch row 5 and the write-back only needs scratch rows 0-7.
 */
void vp8_mbloop_filter_vertical_edge_c_sse2
(
  unsigned char *s,
  int p,
  const unsigned char *blimit,
  const unsigned char *limit,
  const unsigned char *thresh,
  int count
) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
  unsigned char *src[4];
  unsigned char *dst[4];
  /* Top-left corner of the frame window, and the same column 8 rows down. */
  unsigned char *const upper_rows = s - 5;
  unsigned char *const lower_rows = upper_rows + p*8;

  /* Frame -> scratch: tiles are listed left/right for the upper 8 rows,
   * then left/right for the lower 8 rows. */
  src[0] = upper_rows;
  src[1] = upper_rows + 8;
  src[2] = lower_rows;
  src[3] = lower_rows + 8;

  dst[0] = t_dst;
  dst[1] = t_dst + 16*8;
  dst[2] = t_dst + 8;
  dst[3] = t_dst + 16*8 + 8;

  // 16x16->16x16 or 16x8->8x16
  transpose(src, p, dst, 16, (1 << count));

  /* The vertical edge is now a horizontal edge at scratch row 5. */
  vp8_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
                                           thresh, count);

  /* Scratch -> frame: only the first 8 scratch rows are written back. */
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = upper_rows;
  dst[1] = lower_rows;

  // 16x8->8x16 or 8x8->8x8
  transpose(src, 16, dst, p, (1 << (count - 1)));
}
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
......@@ -366,14 +457,28 @@ void vp8_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, struct loop_filter_info *lfi) {
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);