Commit b07e5b6f authored by Scott LaVarnway

Finished vp8_sixtap_predict4x4_ssse3 function

Added vp8_filter_block1d4_h6_ssse3 and vp8_filter_block1d4_v6_ssse3
assembly routines.  Also removed unused assembly.

Change-Id: I01c1021835f2edda9da706822345f217087ca0d0
parent 99f46d62
...@@ -316,21 +316,21 @@ sym(vp8_filter_block1d4_h6_ssse3): ...@@ -316,21 +316,21 @@ sym(vp8_filter_block1d4_h6_ssse3):
; end prolog ; end prolog
movsxd rdx, DWORD PTR arg(5) ;table index movsxd rdx, DWORD PTR arg(5) ;table index
mov rsi, arg(0) ;src_ptr xor rsi, rsi
shl rdx, 4 ; shl rdx, 4 ;
lea rax, [k0_k5 GLOBAL] lea rax, [k0_k5 GLOBAL]
add rax, rdx add rax, rdx
movdqa xmm7, [rd GLOBAL] movdqa xmm7, [rd GLOBAL]
cmp esi, DWORD PTR [rax]
je vp8_filter_block1d4_h4_ssse3
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr mov rdi, arg(2) ;output_ptr
movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rcx, dword ptr arg(4) ;output_height movsxd rcx, dword ptr arg(4) ;output_height
...@@ -362,10 +362,8 @@ filter_block1d4_h6_rowloop_ssse3: ...@@ -362,10 +362,8 @@ filter_block1d4_h6_rowloop_ssse3:
psraw xmm0, 7 psraw xmm0, 7
packuswb xmm0, xmm0 packuswb xmm0, xmm0
; movd DWORD PTR [rdi], xmm0
punpcklbw xmm0, xmm1
movq MMWORD PTR [rdi], xmm0
add rdi, rdx add rdi, rdx
dec rcx dec rcx
jnz filter_block1d4_h6_rowloop_ssse3 jnz filter_block1d4_h6_rowloop_ssse3
...@@ -378,6 +376,53 @@ filter_block1d4_h6_rowloop_ssse3: ...@@ -378,6 +376,53 @@ filter_block1d4_h6_rowloop_ssse3:
pop rbp pop rbp
ret ret
; 4-tap horizontal path of vp8_filter_block1d4_h6_ssse3.
; Reached through the `je` in the 6-tap entry code when the first dword of
; the selected k0_k5 table entry compares equal to zero, so only the k1_k3
; and k2_k4 coefficient rows are applied here.
; On entry: rax = address of the selected k0_k5 entry,
;           xmm7 = [rd GLOBAL] (presumably the rounding constant - confirm
;           against the table definition).
; Writes 4 output bytes per row for output_height rows.
vp8_filter_block1d4_h4_ssse3:
; Coefficient rows are addressed relative to the selected k0_k5 entry.
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
; Keep both shuffle masks in registers so the row loop does not reload them.
; NOTE(review): mask contents are not visible in this view; presumably they
; pair source bytes to match the k2_k4 / k1_k3 layouts - confirm.
movdqa xmm0, XMMWORD PTR [shuf2b GLOBAL]
movdqa xmm3, XMMWORD PTR [shuf3b GLOBAL]
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
; rax is reused from here on as the source stride; the table pointer is no
; longer needed once xmm5/xmm6 are loaded.
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rdx, dword ptr arg(3) ;output_pitch
; Per-row loop: rsi = current source row, rdi = current output row,
; rcx = rows remaining.
filter_block1d4_h4_rowloop_ssse3:
; Unaligned 16-byte load starting 2 bytes before the current source position.
movdqu xmm1, XMMWORD PTR [rsi - 2]
movdqa xmm2, xmm1
pshufb xmm1, xmm0 ;;[shuf2b GLOBAL]
pshufb xmm2, xmm3 ;;[shuf3b GLOBAL]
; Multiply unsigned source bytes by signed coefficients and add adjacent
; pairs into 16-bit lanes.
pmaddubsw xmm1, xmm5
;--
pmaddubsw xmm2, xmm6
; Advance to the next source row (lea leaves the flags untouched).
lea rsi, [rsi + rax]
;--
; Sum both partial products plus xmm7 with signed saturation, shift right
; by 7, then pack to unsigned bytes with saturation.
paddsw xmm1, xmm7
paddsw xmm1, xmm2
psraw xmm1, 7
packuswb xmm1, xmm1
; Store the low 4 bytes of the packed result as one output row.
movd DWORD PTR [rdi], xmm1
add rdi, rdx
dec rcx
jnz filter_block1d4_h4_rowloop_ssse3
; Restores the registers saved by the shared function prolog, which lies
; outside this block.
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block1d16_v6_ssse3 ;void vp8_filter_block1d16_v6_ssse3
;( ;(
; unsigned char *src_ptr, ; unsigned char *src_ptr,
...@@ -700,81 +745,88 @@ vp8_filter_block1d8_v4_ssse3_loop: ...@@ -700,81 +745,88 @@ vp8_filter_block1d8_v4_ssse3_loop:
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
;void vp8_filter_block1d4_v6_ssse3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ;(
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned char *src_ptr,
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned int src_pitch,
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned char *output_ptr,
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned int out_pitch,
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned int output_height,
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; unsigned int vp8_filter_index
global sym(vp8_filter_block1d8_h6_ssse3_slow) ;)
sym(vp8_filter_block1d8_h6_ssse3_slow): global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 6
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
; end prolog ; end prolog
mov rdx, arg(6) ;vp8_filter movsxd rdx, DWORD PTR arg(5) ;table index
mov rsi, arg(0) ;src_ptr xor rsi, rsi
shl rdx, 4 ;
mov rdi, arg(1) ;output_ptr
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rax, dword ptr arg(2) ;src_pixels_per_line
movq xmm7, [rdx]
pxor xmm4, xmm4
movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL]
movdqa xmm6, XMMWORD PTR [shuf2 GLOBAL]
movsxd rdx, dword ptr arg(5) ;output_width
punpcklqdq xmm7, xmm7 ;copy filter constants to upper 8 bytes lea rax, [k0_k5 GLOBAL]
add rax, rdx
filter_block1d8_h6_rowloop3_slow: movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
movdqu xmm0, XMMWORD PTR [rsi - 2] mov rdi, arg(2) ;output_ptr
%if ABI_IS_32BIT=0
movsxd r8, DWORD PTR arg(3) ; out_pitch
%endif
movsxd rcx, DWORD PTR arg(4) ;[output_height]
lea rsi, [rsi + rax] cmp esi, DWORD PTR [rax]
je vp8_filter_block1d4_v4_ssse3
movdqa xmm1, xmm0 movq mm5, MMWORD PTR [rax] ;k0_k5
pshufb xmm0, XMMWORD PTR [shuf1 GLOBAL] movq mm6, MMWORD PTR [rax+256] ;k2_k4
movq mm7, MMWORD PTR [rax+128] ;k1_k3
movdqa xmm2, xmm1 mov rsi, arg(0) ;src_ptr
pmaddubsw xmm0, xmm7
pshufb xmm1, XMMWORD PTR [shuf2 GLOBAL]
movdqa xmm3, xmm2 mov rax, rsi
pmaddubsw xmm1, xmm7 add rax, rdx
pshufb xmm2, XMMWORD PTR [shuf3 GLOBAL]
pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL] vp8_filter_block1d4_v6_ssse3_loop:
movd mm1, DWORD PTR [rsi] ;A
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
movd mm4, DWORD PTR [rax + rdx * 2] ;D
movd mm0, DWORD PTR [rsi + rdx * 4] ;E
pmaddubsw xmm2, xmm7 punpcklbw mm2, mm4 ;B D
pmaddubsw xmm3, xmm7 punpcklbw mm3, mm0 ;C E
;4 cycles
phaddsw xmm0, xmm1 movd mm0, DWORD PTR [rax + rdx * 4] ;F
phaddsw xmm2, xmm3
;7 cycles
phaddsw xmm0, xmm2
;7 cycles
movq mm4, [rd GLOBAL]
paddsw xmm0, [rd GLOBAL] pmaddubsw mm3, mm6
psraw xmm0, 7 punpcklbw mm1, mm0 ;A F
packuswb xmm0, xmm0 pmaddubsw mm2, mm7
pmaddubsw mm1, mm5
add rsi, rdx
add rax, rdx
;--
;--
paddsw mm2, mm3
paddsw mm2, mm1
paddsw mm2, mm4
psraw mm2, 7
packuswb mm2, mm2
; movd DWORD PTR [rdi], mm2
punpcklbw xmm0, xmm4
movdqa XMMWORD Ptr [rdi], xmm0 %if ABI_IS_32BIT
add rdi, rdx add rdi, DWORD PTR arg(3) ;[out_pitch]
%else
add rdi, r8
%endif
dec rcx dec rcx
jnz filter_block1d8_h6_rowloop3_slow ; next row jnz vp8_filter_block1d4_v6_ssse3_loop
; begin epilog ; begin epilog
pop rdi pop rdi
...@@ -783,111 +835,46 @@ filter_block1d8_h6_rowloop3_slow: ...@@ -783,111 +835,46 @@ filter_block1d8_h6_rowloop3_slow:
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
;void vp8_filter_block1d16_h6_ssse3
;(
; unsigned char *src_ptr,
; unsigned short *output_ptr,
; unsigned int src_pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short *vp8_filter
;)
global sym(vp8_filter_block1d16_h6_ssse3_slow)
sym(vp8_filter_block1d16_h6_ssse3_slow):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdx, arg(6) ;vp8_filter
mov rsi, arg(0) ;src_ptr
mov rdi, arg(1) ;output_ptr
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rax, dword ptr arg(2) ;src_pixels_per_line
movq xmm7, [rdx]
pxor xmm4, xmm4
movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL]
movdqa xmm6, XMMWORD PTR [shuf2 GLOBAL]
movsxd rdx, dword ptr arg(5) ;output_width vp8_filter_block1d4_v4_ssse3:
movq mm6, MMWORD PTR [rax+256] ;k2_k4
movq mm7, MMWORD PTR [rax+128] ;k1_k3
movq mm5, MMWORD PTR [rd GLOBAL]
punpcklqdq xmm7, xmm7 ;copy filter constants to upper 8 bytes mov rsi, arg(0) ;src_ptr
sub rdi, rdx
filter_block1d16_h6_rowloop3_slow:
movdqu xmm0, XMMWORD PTR [rsi - 2]
movdqa xmm1, xmm0
pshufb xmm0, xmm5
movdqa xmm2, xmm1 mov rax, rsi
pmaddubsw xmm0, xmm7 add rax, rdx
pshufb xmm1, xmm6
movdqa xmm3, xmm2
pmaddubsw xmm1, xmm7
pshufb xmm2, XMMWORD PTR [shuf3 GLOBAL]
movdqu xmm4, XMMWORD PTR [rsi + 6]
pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL]
lea rsi, [rsi + rax]
pmaddubsw xmm2, xmm7
phaddsw xmm0, xmm1
pmaddubsw xmm3, xmm7
movdqa xmm1, xmm4
pshufb xmm4, xmm5
movdqa xmm5, xmm1
pmaddubsw xmm4, xmm7
pshufb xmm1, xmm6
phaddsw xmm2, xmm3
pmaddubsw xmm1, xmm7
movdqa xmm3, xmm5
pshufb xmm5, XMMWORD PTR [shuf3 GLOBAL]
add rdi, rdx
pmaddubsw xmm5, xmm7
pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL]
phaddsw xmm4, xmm1
dec rcx
phaddsw xmm0, xmm2
pmaddubsw xmm3, xmm7
vp8_filter_block1d4_v4_ssse3_loop:
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
movd mm4, DWORD PTR [rax + rdx * 2] ;D
movd mm0, DWORD PTR [rsi + rdx * 4] ;E
paddsw xmm0, [rd GLOBAL] punpcklbw mm2, mm4 ;B D
psraw xmm0, 7 punpcklbw mm3, mm0 ;C E
packuswb xmm0, xmm0
phaddsw xmm5, xmm3
pxor xmm3, xmm3
punpcklbw xmm0, xmm3
;--
;--
;--
;--
phaddsw xmm4, xmm5 pmaddubsw mm3, mm6
movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL] pmaddubsw mm2, mm7
movdqa XMMWORD Ptr [rdi], xmm0 add rsi, rdx
;-- add rax, rdx
;--
;--
;-- ;--
;-- ;--
paddsw xmm4, [rd GLOBAL] paddsw mm2, mm3
psraw xmm4, 7 paddsw mm2, mm5
packuswb xmm4, xmm4 psraw mm2, 7
; packuswb mm2, mm2
punpcklbw xmm4, xmm3
movdqa XMMWORD Ptr [rdi+16], xmm4 movd DWORD PTR [rdi], mm2
jnz filter_block1d16_h6_rowloop3_slow ; next row
%if ABI_IS_32BIT
add rdi, DWORD PTR arg(3) ;[out_pitch]
%else
add rdi, r8
%endif
dec rcx
jnz vp8_filter_block1d4_v4_ssse3_loop
; begin epilog ; begin epilog
pop rdi pop rdi
...@@ -899,22 +886,6 @@ filter_block1d16_h6_rowloop3_slow: ...@@ -899,22 +886,6 @@ filter_block1d16_h6_rowloop3_slow:
SECTION_RODATA SECTION_RODATA
align 16 align 16
shuf1:
db 0, 1, 2, 4, 3, 5, 128, 128, 1, 2, 3, 5, 4, 6, 128, 128
shuf2:
db 2, 3, 4, 6, 5, 7, 128, 128, 3, 4, 5, 7, 6, 8, 128, 128
shuf3:
db 4, 5, 6, 8, 7, 9, 128, 128, 5, 6, 7, 9, 8, 10, 128, 128
shuf4:
db 6, 7, 8, 10, 9, 11, 128, 128, 7, 8, 9, 11, 10, 12, 128, 128
shuf1a:
db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
shuf2a:
db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3a:
db 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12
shuf1b: shuf1b:
db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b: shuf2b:
......
...@@ -104,8 +104,8 @@ extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3); ...@@ -104,8 +104,8 @@ extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3);
#undef vp8_subpix_sixtap8x4 #undef vp8_subpix_sixtap8x4
#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3 #define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3
//#undef vp8_subpix_sixtap4x4 #undef vp8_subpix_sixtap4x4
//#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3 #define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
//#undef vp8_subpix_bilinear16x16 //#undef vp8_subpix_bilinear16x16
......
...@@ -402,6 +402,26 @@ extern void vp8_filter_block1d8_v6_ssse3 ...@@ -402,6 +402,26 @@ extern void vp8_filter_block1d8_v6_ssse3
unsigned int vp8_filter_index unsigned int vp8_filter_index
); );
extern void vp8_filter_block1d4_h6_ssse3
(
unsigned char *src_ptr,
unsigned int src_pixels_per_line,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int output_height,
unsigned int vp8_filter_index
);
extern void vp8_filter_block1d4_v6_ssse3
(
unsigned char *src_ptr,
unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
unsigned int vp8_filter_index
);
void vp8_sixtap_predict16x16_ssse3 void vp8_sixtap_predict16x16_ssse3
( (
unsigned char *src_ptr, unsigned char *src_ptr,
...@@ -509,21 +529,24 @@ void vp8_sixtap_predict4x4_ssse3 ...@@ -509,21 +529,24 @@ void vp8_sixtap_predict4x4_ssse3
int dst_pitch int dst_pitch
) )
{ {
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 16*16); DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
if (xoffset) if (xoffset)
{ {
if (yoffset) if (yoffset)
{ {
vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
} vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
else }
{ else
} {
} vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
else }
{ }
} else
{
vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
}
} }
......
...@@ -123,7 +123,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) ...@@ -123,7 +123,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_ssse3; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_ssse3;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_ssse3; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_ssse3;
rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_ssse3; rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_ssse3;
// rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3; rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3;
} }
#endif #endif
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment