Commit 85358d04 authored by Johann

Fix data accesses for simple loopfilters

The data that the simple horizontal loopfilter reads is aligned; treat
it accordingly.

For the vertical filter, only the bottom 4 bytes of each load are used,
so read 4 bytes instead of 16 (and avoid the penalty for unaligned
access).

This shows a small improvement on older processors, which have a
significant penalty for unaligned reads.

postproc_mmx.c is unused

Change-Id: I87b29bbc0c3b19ee1ca1de3c4f47332a53087b3d
parent c5f890af
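For context, the instruction changes below map onto well-known SSE2 intrinsics. The following sketch is not part of the patch; the helper names and the memcpy-based 4-byte read are illustrative only. It shows the three load forms involved: the unaligned 16-byte load being removed, the aligned 16-byte load used where the horizontal filter's source is known to be 16-byte aligned, and the 4-byte load used where the vertical filter only needs the low dword.

/*
 * Sketch (not from the patch): the three SSE2 load forms the commit
 * switches between, written as intrinsics.
 *
 *   movdqu -> _mm_loadu_si128   (unaligned 16-byte load)
 *   movdqa -> _mm_load_si128    (aligned 16-byte load; faults if the
 *                                address is not 16-byte aligned)
 *   movd   -> _mm_cvtsi32_si128 (4-byte load into the low dword)
 */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* Horizontal case: the row is 16-byte aligned, so the aligned load is safe. */
static __m128i load_row_aligned(const uint8_t *row)
{
    return _mm_load_si128((const __m128i *)row);    /* movdqa */
}

/* Vertical case: only 4 bytes per row are needed, so load just those. */
static __m128i load_row4(const uint8_t *row)
{
    int32_t tmp;
    memcpy(&tmp, row, sizeof(tmp));                 /* unaligned-safe 4-byte read */
    return _mm_cvtsi32_si128(tmp);                  /* movd */
}

/* The form being replaced: a full unaligned 16-byte load. */
static __m128i load_row_unaligned(const uint8_t *row)
{
    return _mm_loadu_si128((const __m128i *)row);   /* movdqu */
}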
@@ -1395,8 +1395,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
 neg rax
 ; calculate mask
-movdqu xmm1, [rsi+2*rax] ; p1
-movdqu xmm0, [rdi] ; q1
+movdqa xmm1, [rsi+2*rax] ; p1
+movdqa xmm0, [rdi] ; q1
 movdqa xmm2, xmm1
 movdqa xmm7, xmm0
 movdqa xmm4, xmm0
@@ -1406,8 +1406,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
 pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
 psrlw xmm1, 1 ; abs(p1-q1)/2
-movdqu xmm5, [rsi+rax] ; p0
-movdqu xmm4, [rsi] ; q0
+movdqa xmm5, [rsi+rax] ; p0
+movdqa xmm4, [rsi] ; q0
 movdqa xmm0, xmm4 ; q0
 movdqa xmm6, xmm5 ; p0
 psubusb xmm5, xmm4 ; p0-=q0
@@ -1449,7 +1449,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
 psubsb xmm3, xmm0 ; q0-= q0 add
 pxor xmm3, [GLOBAL(t80)] ; unoffset
-movdqu [rsi], xmm3 ; write back
+movdqa [rsi], xmm3 ; write back
 ; now do +3 side
 psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
@@ -1465,7 +1465,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
 paddsb xmm6, xmm0 ; p0+= p0 add
 pxor xmm6, [GLOBAL(t80)] ; unoffset
-movdqu [rsi+rax], xmm6 ; write back
+movdqa [rsi+rax], xmm6 ; write back
 ; begin epilog
 pop rdi
@@ -1507,17 +1507,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
 lea rdx, [rsi + rax*4]
 lea rcx, [rdx + rax]
-movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
-movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
-movdqu xmm2, [rdi] ; 13 12 11 10
-movdqu xmm3, [rcx] ; 53 52 51 50
+movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+movd xmm2, [rdi] ; 13 12 11 10
+movd xmm3, [rcx] ; 53 52 51 50
 punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
 punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
-movdqu xmm4, [rsi + rax*2] ; 23 22 21 20
-movdqu xmm5, [rdx + rax*2] ; 63 62 61 60
-movdqu xmm6, [rdi + rax*2] ; 33 32 31 30
-movdqu xmm7, [rcx + rax*2] ; 73 72 71 70
+movd xmm4, [rsi + rax*2] ; 23 22 21 20
+movd xmm5, [rdx + rax*2] ; 63 62 61 60
+movd xmm6, [rdi + rax*2] ; 33 32 31 30
+movd xmm7, [rcx + rax*2] ; 73 72 71 70
 punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
 punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
@@ -1540,17 +1540,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
 lea rdx, [rsi + rax*4]
 lea rcx, [rdx + rax]
-movdqu xmm4, [rsi] ; 83 82 81 80
-movdqu xmm1, [rdx] ; c3 c2 c1 c0
-movdqu xmm6, [rdi] ; 93 92 91 90
-movdqu xmm3, [rcx] ; d3 d2 d1 d0
+movd xmm4, [rsi] ; 83 82 81 80
+movd xmm1, [rdx] ; c3 c2 c1 c0
+movd xmm6, [rdi] ; 93 92 91 90
+movd xmm3, [rcx] ; d3 d2 d1 d0
 punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
 punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
-movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0
-movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0
-movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0
-movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
 punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
 punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
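The vertical hunks above pair each 4-byte movd load with punpckldq to pack two rows' fragments into one register before the transpose. Below is a hedged intrinsics sketch of that gather step; the helper names, the stride parameter, and the explicit row offsets are illustrative and not libvpx's API.

/*
 * Sketch (not libvpx code): the movd + punpckldq gather used in the
 * vertical filter, written with intrinsics. Two registers end up with
 * the layouts noted in the asm comments:
 *   lo = 43 42 41 40 03 02 01 00   (rows 0 and 4)
 *   hi = 53 52 51 50 13 12 11 10   (rows 1 and 5)
 */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static __m128i load4(const uint8_t *p)              /* movd */
{
    int32_t v;
    memcpy(&v, p, sizeof(v));
    return _mm_cvtsi32_si128(v);
}

static void gather_rows(const uint8_t *src, ptrdiff_t stride,
                        __m128i *lo, __m128i *hi)
{
    __m128i r0 = load4(src + 0 * stride);           /* 03 02 01 00 */
    __m128i r1 = load4(src + 1 * stride);           /* 13 12 11 10 */
    __m128i r4 = load4(src + 4 * stride);           /* 43 42 41 40 */
    __m128i r5 = load4(src + 5 * stride);           /* 53 52 51 50 */

    *lo = _mm_unpacklo_epi32(r0, r4);               /* punpckldq: 43..40 03..00 */
    *hi = _mm_unpacklo_epi32(r1, r5);               /* punpckldq: 53..50 13..10 */
}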