Commit 3556deac authored by Johann

combine loopfilter data access

The data processed by the loopfilter overlaps. At the block level, this
results in some redundant transposes. Grouping the filtering allows a
single 16x16 transpose (and its inverse) instead of three 16x8 transposes
(and three more inverses).

This implementation is x86_64 only. We retain the previous
implementation for x86.

Improvements are obviously material dependent, but the gain seems to be
~1% in tests here.

Change-Id: I467b7ec3655be98fb5f1a94b5d145e5e5a660007
parent 6f9457ec
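
For intuition, a minimal C sketch of the transpose argument above (illustrative only; the actual change does this work in SSE2 assembly, and transpose_16x16 is a hypothetical helper, not a function added by this commit). Filtering a vertical edge needs the pixels on either side of a column, so the data is transposed first; because the three inner vertical edges of the 16x16 luma block lie in the same block, one transpose (and one inverse transpose afterwards) can serve all three edges instead of three overlapping 16x8 transposes.

/* Illustrative 16x16 byte transpose: after it, the vertical edges at
 * columns 4, 8 and 12 become horizontal edges at rows 4, 8 and 12, so the
 * horizontal-edge filter can process all three in a single pass. */
static void transpose_16x16(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            dst[c * dst_stride + r] = src[r * src_stride + c];
}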
@@ -271,6 +271,7 @@
%endmacro
%if ABI_IS_32BIT
;void vp8_loop_filter_horizontal_edge_sse2
;(
@@ -321,6 +322,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
pop rbp
ret
%endif
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
@@ -1005,6 +1007,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movd [rdi+2*rcx+2], %2
%endmacro
%if ABI_IS_32BIT
;void vp8_loop_filter_vertical_edge_sse2
;(
@@ -1072,6 +1075,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
pop rbp
ret
%endif
;void vp8_loop_filter_vertical_edge_uv_sse2
;(
@@ -17,8 +17,13 @@ prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
#if HAVE_SSE2 && ARCH_X86_64
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
#else
prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
#endif
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
@@ -132,9 +137,13 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
#if ARCH_X86_64
vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#endif
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
@@ -153,9 +162,13 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned
void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
#if ARCH_X86_64
vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#endif
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
@@ -99,6 +99,9 @@ ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
ifeq ($(ARCH_X86_64),yes)
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
endif
# common (c)
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c