Commit 8b0cf5f7 authored by Johann

x86 sse2 temporal_filter_apply

count can be reduced to unsigned short because the max number of filtered
frames is set to 15. The max value added for any frame is 32 (modifier = 16,
filter_weight = 2), and 15*32 = 480, which requires only 9 bits.

This function goes from about 7000 us / 1000 iterations for the C code
to < 275 us / 1000 iterations for SSE2 with block_size = 16, and from
about 1800 us / 1000 iterations to < 100 us / 1000 iterations with
block_size = 8.

Change-Id: I64a32607f58a2d33c39286f468b04ccd457d9e6e
parent b095d9df
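As background for the change below, here is a minimal scalar sketch of the per-pixel arithmetic this commit vectorizes (a hypothetical helper, condensed from vp8_temporal_filter_apply_c in this diff; it assumes strength >= 1). It also illustrates the claim above that count fits in an unsigned short: each frame adds at most 16 * 2 = 32, so 15 frames add at most 480.

#include <stdio.h>

/* Illustrative, simplified scalar version of the per-pixel step in
 * vp8_temporal_filter_apply_c (hypothetical helper; assumes strength >= 1). */
static void apply_one_pixel(int src, int pred, int strength, int filter_weight,
                            unsigned int *accumulator, unsigned short *count)
{
    int modifier = src - pred;
    modifier *= modifier;              /* d^2 */
    modifier *= 3;                     /* 3 * d^2 */
    modifier += 1 << (strength - 1);   /* rounding bit */
    modifier >>= strength;             /* ~ round(3 * d^2 / 2^strength) */
    if (modifier > 16)
        modifier = 16;
    modifier = 16 - modifier;          /* large differences contribute 0 */
    modifier *= filter_weight;         /* at most 16 * 2 = 32 per frame */

    *count += modifier;                /* 15 frames * 32 = 480 -> 9 bits */
    *accumulator += modifier * pred;
}

int main(void)
{
    unsigned int acc = 0;
    unsigned short cnt = 0;
    int frame;
    /* worst case for count: identical pixels, filter_weight = 2, 15 frames */
    for (frame = 0; frame < 15; frame++)
        apply_one_pixel(128, 128, 6, 2, &acc, &cnt);
    printf("count = %u (max 480, fits in unsigned short)\n", cnt);
    return 0;
}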
@@ -36,36 +36,9 @@
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
#define USE_FILTER_LUT 0 // use lookup table to improve filter
#if VP8_TEMPORAL_ALT_REF
#if USE_FILTER_LUT
// for (strength = 0; strength <= 6; strength++) {
// for (delta = 0; delta <= 18; delta++) {
// float coeff = (3.0 * delta * delta) / pow(2, strength);
// printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
// }
// printf("\n");
// }
static int modifier_lut[7][19] =
{
// Strength=0
{16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=1
{16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=2
{16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=3
{16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=4
{16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=5
{16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
// Strength=6
{16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
};
#endif
static void vp8_temporal_filter_predictors_mb_c
(
MACROBLOCKD *x,
@@ -86,14 +59,11 @@ static void vp8_temporal_filter_predictors_mb_c
if ((mv_row | mv_col) & 7)
{
// vp8_sixtap_predict16x16_c(yptr, stride,
// mv_col & 7, mv_row & 7, &pred[0], 16);
x->subpixel_predict16x16(yptr, stride,
mv_col & 7, mv_row & 7, &pred[0], 16);
}
else
{
//vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
}
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
int strength,
int filter_weight,
unsigned int *accumulator,
unsigned int *count
unsigned short *count
)
{
int i, j, k;
int modifier;
int byte = 0;
#if USE_FILTER_LUT
int *lut = modifier_lut[strength];
#endif
for (i = 0,k = 0; i < block_size; i++)
{
for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
int src_byte = frame1[byte];
int pixel_value = *frame2++;
#if USE_FILTER_LUT
modifier = abs(src_byte-pixel_value);
modifier = modifier>18 ? 0 : lut[modifier];
#else
modifier = src_byte - pixel_value;
// This is an integer approximation of:
// float coeff = (3.0 * modifier * modifier) / pow(2, strength);
// modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
modifier *= modifier;
modifier *= 3;
modifier += 1 << (strength - 1);
@@ -160,7 +125,6 @@
modifier = 16;
modifier = 16 - modifier;
#endif
modifier *= filter_weight;
count[k] += modifier;
@@ -331,12 +295,12 @@ static void vp8_temporal_filter_iterate_c
int MBs = cpi->common.MBs;
int mb_y_offset = 0;
int mb_uv_offset = 0;
unsigned int accumulator[384];
unsigned int count[384];
DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
unsigned char *dst1, *dst2;
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
// Save input state
unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -366,7 +330,7 @@ static void vp8_temporal_filter_iterate_c
int stride;
vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
vpx_memset(count, 0, 384*sizeof(unsigned int));
vpx_memset(count, 0, 384*sizeof(unsigned short));
#if ALT_REF_MC_ENABLED
// Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -22,9 +22,13 @@
int strength, \
int filter_weight, \
unsigned int *accumulator, \
unsigned int *count \
unsigned short *count \
)
#if ARCH_X86 || ARCH_X86_64
#include "x86/temporal_filter_x86.h"
#endif
#ifndef vp8_temporal_filter_apply
#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
#endif
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; void vp8_temporal_filter_apply_sse2 | arg
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
; unsigned char *frame2, | 2
; unsigned int block_size, | 3
; int strength, | 4
; int filter_weight, | 5
; unsigned int *accumulator, | 6
; unsigned short *count) | 7
global sym(vp8_temporal_filter_apply_sse2)
sym(vp8_temporal_filter_apply_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
ALIGN_STACK 16, rax
%define block_size 0
%define strength 16
%define filter_weight 32
%define rounding_bit 48
%define rbp_backup 64
%define stack_size 80
sub rsp, stack_size
mov [rsp + rbp_backup], rbp
; end prolog
mov rdx, arg(3)
mov [rsp + block_size], rdx
movd xmm6, arg(4)
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
; calculate the rounding bit outside the loop
; 0x8000 >> (16 - strength)
mov rdx, 16
sub rdx, arg(4) ; 16 - strength
movd xmm4, rdx ; can't use rdx w/ shift
movdqa xmm5, [GLOBAL(_const_top_bit)]
psrlw xmm5, xmm4
movdqa [rsp + rounding_bit], xmm5
mov rsi, arg(0) ; src/frame1
mov rdx, arg(2) ; predictor frame
mov rdi, arg(6) ; accumulator
mov rax, arg(7) ; count
; dup the filter weight and store for later
movd xmm0, arg(5) ; filter_weight
pshuflw xmm0, xmm0, 0
punpcklwd xmm0, xmm0
movdqa [rsp + filter_weight], xmm0
mov rbp, arg(1) ; stride
pxor xmm7, xmm7 ; zero for extraction
lea rcx, [rdx + 16*16*1]
cmp dword ptr [rsp + block_size], 8
jne temporal_filter_apply_load_16
lea rcx, [rdx + 8*8*1]
temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm0, xmm7 ; src[ 0- 7]
movq xmm1, [rsi] ; second row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm1, xmm7 ; src[ 8-15]
jmp temporal_filter_apply_load_finished
temporal_filter_apply_load_16:
movdqa xmm0, [rsi] ; src (frame1)
lea rsi, [rsi + rbp] ; += stride
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; src[ 0- 7]
punpckhbw xmm1, xmm7 ; src[ 8-15]
temporal_filter_apply_load_finished:
movdqa xmm2, [rdx] ; predictor (frame2)
movdqa xmm3, xmm2
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
punpckhbw xmm3, xmm7 ; pred[ 8-15]
; modifier = src_byte - pixel_value
psubw xmm0, xmm2 ; src - pred[ 0- 7]
psubw xmm1, xmm3 ; src - pred[ 8-15]
; modifier *= modifier
pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
pmullw xmm1, xmm1 ; modifier[ 8-15]^2
; modifier *= 3
pmullw xmm0, [GLOBAL(_const_3w)]
pmullw xmm1, [GLOBAL(_const_3w)]
; modifier += 0x8000 >> (16 - strength)
paddw xmm0, [rsp + rounding_bit]
paddw xmm1, [rsp + rounding_bit]
; modifier >>= strength
psrlw xmm0, [rsp + strength]
psrlw xmm1, [rsp + strength]
; modifier = 16 - modifier
; saturation takes care of modifier > 16
movdqa xmm3, [GLOBAL(_const_16w)]
movdqa xmm2, [GLOBAL(_const_16w)]
psubusw xmm3, xmm1
psubusw xmm2, xmm0
; modifier *= filter_weight
pmullw xmm2, [rsp + filter_weight]
pmullw xmm3, [rsp + filter_weight]
; count
movdqa xmm4, [rax]
movdqa xmm5, [rax+16]
; += modifier
paddw xmm4, xmm2
paddw xmm5, xmm3
; write back
movdqa [rax], xmm4
movdqa [rax+16], xmm5
lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
; load and extract the predictor up to shorts
pxor xmm7, xmm7
movdqa xmm0, [rdx]
lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; pred[ 0- 7]
punpckhbw xmm1, xmm7 ; pred[ 8-15]
; modifier *= pixel_value
pmullw xmm0, xmm2
pmullw xmm1, xmm3
; expand to double words
movdqa xmm2, xmm0
punpcklwd xmm0, xmm7 ; [ 0- 3]
punpckhwd xmm2, xmm7 ; [ 4- 7]
movdqa xmm3, xmm1
punpcklwd xmm1, xmm7 ; [ 8-11]
punpckhwd xmm3, xmm7 ; [12-15]
; accumulator
movdqa xmm4, [rdi]
movdqa xmm5, [rdi+16]
movdqa xmm6, [rdi+32]
movdqa xmm7, [rdi+48]
; += modifier
paddw xmm4, xmm0
paddw xmm5, xmm2
paddw xmm6, xmm1
paddw xmm7, xmm3
; write back
movdqa [rdi], xmm4
movdqa [rdi+16], xmm5
movdqa [rdi+32], xmm6
movdqa [rdi+48], xmm7
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
cmp rdx, rcx
je temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
cmp dword ptr [rsp + block_size], 16
je temporal_filter_apply_load_16
jmp temporal_filter_apply_load_8
temporal_filter_apply_epilog:
; begin epilog
mov rbp, [rsp + rbp_backup]
add rsp, stack_size
pop rsp
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
_const_3w:
times 8 dw 3
align 16
_const_top_bit:
times 8 dw 1<<15
align 16
_const_16w:
times 8 dw 16
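A side note on the constants above: the assembly precomputes its rounding bit as 0x8000 >> (16 - strength) (see _const_top_bit and the psrlw by 16 - strength) because the per-lane shift count has to come from a register; for strength >= 1 this is the same value the C code builds with 1 << (strength - 1). A small stand-alone check of that identity (not part of the commit):

#include <assert.h>
#include <stdio.h>

/* Stand-alone sanity check (not from the commit): 0x8000 >> (16 - s)
 * equals 1 << (s - 1) for every strength the filter uses (s >= 1). */
int main(void)
{
    int strength;
    for (strength = 1; strength <= 6; strength++) {
        unsigned int asm_bit = 0x8000u >> (16 - strength);
        unsigned int c_bit = 1u << (strength - 1);
        assert(asm_bit == c_bit);
        printf("strength = %d, rounding bit = 0x%04x\n", strength, asm_bit);
    }
    return 0;
}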
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
#define __INC_VP8_TEMPORAL_FILTER_X86_H
#if HAVE_SSE2
extern prototype_apply(vp8_temporal_filter_apply_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_temporal_filter_apply
#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
#endif
#endif
#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
}
#endif
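Taken together, the header and init changes above follow libvpx's usual dispatch pattern: prototype_apply() declares the signature, the x86 header overrides the vp8_temporal_filter_apply default when runtime CPU detection is off, and the rtcd table entry (cpi->rtcd.temporal.apply) picks the SSE2 version at init when it is on. A condensed sketch of that pattern, with hypothetical macro and struct names (the real macros live in the encoder headers):

/* Condensed sketch of the dispatch pattern (hypothetical TF_APPLY and
 * struct names; not the project's actual macros). */
typedef void (*temporal_filter_apply_fn)(unsigned char *frame1,
                                         unsigned int stride,
                                         unsigned char *frame2,
                                         unsigned int block_size,
                                         int strength,
                                         int filter_weight,
                                         unsigned int *accumulator,
                                         unsigned short *count);

#if CONFIG_RUNTIME_CPU_DETECT
/* runtime path: init code stores the best version in a table */
struct temporal_rtcd { temporal_filter_apply_fn apply; };
#define TF_APPLY(ctx) ((ctx)->apply)             /* call through the pointer */
#else
/* compile-time path: the name is remapped by the headers above */
#define TF_APPLY(ctx) vp8_temporal_filter_apply  /* -> _sse2 or _c */
#endif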
@@ -94,6 +94,7 @@ VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
@@ -107,6 +108,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm