Commit 1083fe49 authored by Ronald S. Bultje's avatar Ronald S. Bultje
Browse files

SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().

decoding

before
10.425
10.432
10.423
=10.426

after:
10.405
10.416
10.398
=10.406, 0.2% faster

encoding

before
14.252
14.331
14.250
14.223
14.241
14.220
14.221
=14.248

after
14.095
14.090
14.085
14.095
14.064
14.081
14.089
=14.086, 1.1% faster

Change-Id: I483d3d8f0deda8ad434cea76e16028380722aee2
parent 0da77a84
......@@ -43,6 +43,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s;
rtcd->recon.build_intra_predictors_mbuv =
vp8_build_intra_predictors_mbuv;
rtcd->recon.build_intra_predictors_mbuv_s =
vp8_build_intra_predictors_mbuv_s;
rtcd->recon.intra4x4_predict =
vp8_intra4x4_predict;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
......
......@@ -26,6 +26,9 @@
#define prototype_build_intra_predictors(sym) \
void sym(MACROBLOCKD *x)
#define prototype_intra4x4_predict(sym) \
void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
......@@ -88,11 +91,30 @@ extern prototype_build_intra_predictors\
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby_s);
#ifndef vp8_recon_build_intra_predictors_mbuv
#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mbuv);
#ifndef vp8_recon_build_intra_predictors_mbuv_s
#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mbuv_s);
#ifndef vp8_recon_intra4x4_predict
#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
#endif
extern prototype_intra4x4_predict\
(vp8_recon_intra4x4_predict);
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
......@@ -105,6 +127,9 @@ typedef struct vp8_recon_rtcd_vtable
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv;
vp8_intra4x4_pred_fn_t intra4x4_predict;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
......
......@@ -14,9 +14,4 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
#endif
......@@ -14,7 +14,7 @@
#include "vpx_mem/vpx_mem.h"
#include "reconintra.h"
void vp8_predict_intra4x4(BLOCKD *x,
void vp8_intra4x4_predict(BLOCKD *x,
int b_mode,
unsigned char *predictor)
{
......
......@@ -229,3 +229,411 @@ sym(vp8_copy_mem16x16_sse2):
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_dc_mmx2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_dc_mmx2)
sym(vp8_intra_pred_uv_dc_mmx2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; from top
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rsi, rax
pxor mm0, mm0
movd mm1, [rsi]
movd mm2, [rsi+4]
punpcklbw mm1, mm0
punpcklbw mm2, mm0
paddw mm1, mm2
pshufw mm2, mm1, 0x0e
paddw mm1, mm2
pshufw mm2, mm1, 0x01
paddw mm1, mm2
; from left
dec rsi
lea rdi, [rax*3]
movzx ecx, byte [rsi+rax]
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
movzx edx, byte [rsi+rax*4]
add ecx, edx
; add up
pextrw edx, mm1, 0x0
lea edx, [edx+ecx+8]
sar edx, 4
movd mm1, edx
pshufw mm1, mm1, 0x0
packuswb mm1, mm1
; write out
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
lea rdi, [rdi+rcx*4]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_dctop_mmx2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_dctop_mmx2)
sym(vp8_intra_pred_uv_dctop_mmx2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; from top
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rsi, rax
pxor mm0, mm0
movd mm1, [rsi]
movd mm2, [rsi+4]
punpcklbw mm1, mm0
punpcklbw mm2, mm0
paddw mm1, mm2
pshufw mm2, mm1, 0x0e
paddw mm1, mm2
pshufw mm2, mm1, 0x01
paddw mm1, mm2
; add up
paddw mm1, [GLOBAL(dc_4)]
psraw mm1, 3
pshufw mm1, mm1, 0x0
packuswb mm1, mm1
; write out
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
lea rdi, [rdi+rcx*4]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_dcleft_mmx2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_dcleft_mmx2)
sym(vp8_intra_pred_uv_dcleft_mmx2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; from left
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
dec rsi
lea rdi, [rax*3]
movzx ecx, byte [rsi]
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
lea edx, [ecx+edx+4]
; add up
shr edx, 3
movd mm1, edx
pshufw mm1, mm1, 0x0
packuswb mm1, mm1
; write out
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
lea rdi, [rdi+rcx*4]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_dc128_mmx(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_dc128_mmx)
sym(vp8_intra_pred_uv_dc128_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
; end prolog
; write out
movq mm1, [GLOBAL(dc_128)]
mov rax, arg(0) ;dst;
movsxd rdx, dword ptr arg(1) ;dst_stride
lea rcx, [rdx*3]
movq [rax ], mm1
movq [rax+rdx ], mm1
movq [rax+rdx*2], mm1
movq [rax+rcx ], mm1
lea rax, [rax+rdx*4]
movq [rax ], mm1
movq [rax+rdx ], mm1
movq [rax+rdx*2], mm1
movq [rax+rcx ], mm1
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_tm_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1)
sym(vp8_intra_pred_uv_tm_%1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; read top row
mov edx, 4
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rsi, rax
pxor xmm0, xmm0
%ifidn %1, ssse3
movdqa xmm2, [GLOBAL(dc_1024)]
%endif
movq xmm1, [rsi]
punpcklbw xmm1, xmm0
; set up left ptrs ans subtract topleft
movd xmm3, [rsi-1]
lea rsi, [rsi+rax-1]
%ifidn %1, sse2
punpcklbw xmm3, xmm0
pshuflw xmm3, xmm3, 0x0
punpcklqdq xmm3, xmm3
%else
pshufb xmm3, xmm2
%endif
psubw xmm1, xmm3
; set up dest ptrs
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_uv_tm_%1_loop:
movd xmm3, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
punpcklbw xmm3, xmm0
punpcklbw xmm5, xmm0
pshuflw xmm3, xmm3, 0x0
pshuflw xmm5, xmm5, 0x0
punpcklqdq xmm3, xmm3
punpcklqdq xmm5, xmm5
%else
pshufb xmm3, xmm2
pshufb xmm5, xmm2
%endif
paddw xmm3, xmm1
paddw xmm5, xmm1
packuswb xmm3, xmm5
movq [rdi ], xmm3
movhps[rdi+rcx], xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_uv_tm_%1_loop
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%endmacro
vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3
;void vp8_intra_pred_uv_ve_mmx(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_ve_mmx)
sym(vp8_intra_pred_uv_ve_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
; end prolog
; read from top
mov rax, arg(2) ;src;
movsxd rdx, dword ptr arg(3) ;src_stride;
sub rax, rdx
movq mm1, [rax]
; write out
mov rax, arg(0) ;dst;
movsxd rdx, dword ptr arg(1) ;dst_stride
lea rcx, [rdx*3]
movq [rax ], mm1
movq [rax+rdx ], mm1
movq [rax+rdx*2], mm1
movq [rax+rcx ], mm1
lea rax, [rax+rdx*4]
movq [rax ], mm1
movq [rax+rdx ], mm1
movq [rax+rdx*2], mm1
movq [rax+rcx ], mm1
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_uv_ho_mmx2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_ho_mmx2)
sym(vp8_intra_pred_uv_ho_mmx2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; read from left and write out
mov edx, 4
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
dec rsi
vp8_intra_pred_uv_ho_mmx2_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
punpcklbw mm1, mm1
pshufw mm0, mm0, 0x0
pshufw mm1, mm1, 0x0
movq [rdi ], mm0
movq [rdi+rcx], mm1
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_uv_ho_mmx2_loop
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
dc_128:
times 8 db 128
dc_4:
times 4 dw 4
align 16
dc_1024:
times 8 dw 0x400
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp8/common/recon.h"
#include "recon_x86.h"
#include "vpx_mem/vpx_mem.h"
#define build_intra_predictors_mbuv_prototype(sym) \
void sym(unsigned char *dst, int dst_stride, \
const unsigned char *src, int src_stride)
typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
static inline void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_stride,
build_intra_predictors_mbuv_fn_t tm_func)
{
int mode = x->mode_info_context->mbmi.uv_mode;
build_intra_predictors_mbuv_fn_t fn;
int src_stride = x->dst.uv_stride;
switch (mode) {
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
case TM_PRED: fn = tm_func; break;
case DC_PRED:
if (x->up_available) {
if (x->left_available) {
fn = vp8_intra_pred_uv_dc_mmx2; break;
} else {
fn = vp8_intra_pred_uv_dctop_mmx2; break;
}
} else if (x->left_available) {
fn = vp8_intra_pred_uv_dcleft_mmx2; break;
} else {
fn = vp8_intra_pred_uv_dc128_mmx; break;
}
break;
default: return;
}
fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
}
void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
vp8_intra_pred_uv_tm_sse2);
}
void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
vp8_intra_pred_uv_tm_ssse3);
}
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
vp8_intra_pred_uv_tm_sse2);
}
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
vp8_intra_pred_uv_tm_ssse3);
}
......@@ -46,6 +46,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
extern prototype_recon_block(vp8_recon2b_sse2);
extern prototype_recon_block(vp8_recon4b_sse2);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon2
......@@ -57,6 +59,26 @@ extern prototype_copy_block(vp8_copy_mem16x16_sse2);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
#undef vp8_recon_build_intra_predictors_mbuv
#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
#undef vp8_recon_build_intra_predictors_mbuv_s
#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
#endif
#endif
#if HAVE_SSSE3
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);