Commit 77119a5c authored by Scott LaVarnway, committed by Gerrit Code Review

Merge "Improved sse2 version of simple loopfilter"

parents 5bfa29b6 1d7d18c6
@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
SHADOW_ARGS_TO_STACK 3
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rcx, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rdx, arg(2) ;blimit
movdqa xmm3, XMMWORD PTR [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
lea rdx, [rcx + rax]
neg rax
; calculate mask
movdqa xmm1, [rsi+2*rax] ; p1
movdqa xmm0, [rdi] ; q1
movdqa xmm0, [rdx] ; q1
mov rdx, arg(2) ;blimit
movdqa xmm1, [rcx+2*rax] ; p1
movdqa xmm2, xmm1
movdqa xmm7, xmm0
movdqa xmm4, xmm0
psubusb xmm0, xmm1 ; q1-=p1
psubusb xmm1, xmm4 ; p1-=q1
psubusb xmm1, xmm7 ; p1-=q1
por xmm1, xmm0 ; abs(p1-q1)
pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
movdqa xmm5, [rsi+rax] ; p0
movdqa xmm4, [rsi] ; q0
movdqa xmm3, XMMWORD PTR [rdx]
movdqa xmm5, [rcx+rax] ; p0
movdqa xmm4, [rcx] ; q0
movdqa xmm0, xmm4 ; q0
movdqa xmm6, xmm5 ; p0
psubusb xmm5, xmm4 ; p0-=q0
psubusb xmm4, xmm6 ; q0-=p0
por xmm5, xmm4 ; abs(p0 - q0)
movdqa xmm4, [GLOBAL(t80)]
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm3, xmm3
pcmpeqb xmm5, xmm3
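
Note: the block above is the simple-filter mask. Per pixel it tests abs(p0 - q0)*2 + abs(p1 - q1)/2 against blimit, using the psubusb/por pair as a branch-free unsigned abs and the tfe mask plus psrlw 1 to halve each byte without pulling bits across byte lanes; pcmpeqb against zero then turns the saturating subtraction of blimit into an all-ones/all-zeros byte mask. A rough scalar sketch of the same test (illustrative only, not the libvpx reference code):

#include <stdlib.h>

/* 0xff where the pixel may be filtered, 0x00 where the edge exceeds blimit.
 * The SSE2 code above produces this result for 16 pixels at once. */
static unsigned char simple_filter_mask(unsigned char blimit,
                                        unsigned char p1, unsigned char p0,
                                        unsigned char q0, unsigned char q1)
{
    return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0x00;
}
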
; start work on filters
pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
pxor xmm2, xmm4 ; p1 offset to convert to signed values
pxor xmm7, xmm4 ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm6, xmm4 ; offset to convert to signed values
pxor xmm0, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm2 ; mask filter values we don't care about
; do + 4 side
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 3
psrlw xmm0, 8
movdqa xmm1, xmm5 ; get a copy of filters
psraw xmm1, 11 ; arithmetic shift right 11
psllw xmm1, 8 ; shift left 8 to put it back
por xmm0, xmm1 ; put the two together to get result
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
psubsb xmm3, xmm0 ; q0-= q0 add
pxor xmm3, [GLOBAL(t80)] ; unoffset
movdqa [rsi], xmm3 ; write back
movdqa xmm1, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
; now do +3 side
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm0, 3
pand xmm0, xmm2 ;clear out upper 3 bits
por xmm0, xmm7 ;add sign
psubsb xmm3, xmm0 ; q0-= q0sz add
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 3
psrlw xmm0, 8
psraw xmm5, 11 ; arithmetic shift right 11
psllw xmm5, 8 ; shift left 8 to put it back
por xmm0, xmm5 ; put the two together to get result
pxor xmm7, xmm7
pcmpgtb xmm7, xmm5 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm5, 3
pand xmm5, xmm2 ;clear out upper 3 bits
por xmm5, xmm7 ;add sign
paddsb xmm6, xmm5 ; p0+= p0 add
pxor xmm3, xmm4 ; unoffset
movdqa [rcx], xmm3 ; write back
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset
movdqa [rsi+rax], xmm6 ; write back
pxor xmm6, xmm4 ; unoffset
movdqa [rcx+rax], xmm6 ; write back
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
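
Note: in both the old and the new arrangement the filter section works on pixels biased into the signed byte range (the pxor with t80, now cached in xmm4), forms a = clamp(p1 - q1 + 3*(q0 - p0)), masks it with the blimit result, then subtracts clamp(a + 4) >> 3 from q0 and adds clamp(a + 3) >> 3 to p0 before un-biasing and writing back. A scalar sketch of that update (clamp8 and simple_filter_px are illustrative names, not libvpx APIs):

#include <stdint.h>

static int8_t clamp8(int v) { return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; }

/* One pixel of the simple loop filter; the assembly applies this to 16
 * pixels per pass with saturating byte arithmetic (psubsb/paddsb). */
static void simple_filter_px(uint8_t *op0, uint8_t *oq0,
                             uint8_t p1u, uint8_t q1u, uint8_t mask)
{
    /* pxor with 0x80 (t80) re-biases unsigned pixels to signed bytes */
    int8_t p1 = (int8_t)(p1u ^ 0x80), q1 = (int8_t)(q1u ^ 0x80);
    int8_t p0 = (int8_t)(*op0 ^ 0x80), q0 = (int8_t)(*oq0 ^ 0x80);

    int8_t d = clamp8(q0 - p0);
    int8_t a = clamp8(p1 - q1);
    a = clamp8(a + d);
    a = clamp8(a + d);
    a = clamp8(a + d);                         /* p1 - q1 + 3 * (q0 - p0) */
    a &= mask;                                 /* drop pixels over blimit  */

    int8_t f1 = clamp8(a + 4) >> 3;            /* "+4 side", taken from q0 */
    int8_t f2 = clamp8(a + 3) >> 3;            /* "+3 side", added to p0   */

    *oq0 = (uint8_t)(clamp8(q0 - f1) ^ 0x80);  /* un-bias and write back   */
    *op0 = (uint8_t)(clamp8(p0 + f2) ^ 0x80);
}
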
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa t0, xmm0 ; save to t0
movdqa t1, xmm2 ; save to t1
lea rsi, [rsi + rax*8]
lea rdi, [rsi + rax]
lea rdx, [rsi + rax*4]
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
movdqa xmm1, xmm4
punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa xmm7, xmm4
punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa xmm6, xmm4
punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa xmm1, xmm0
movdqa xmm3, xmm2
@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
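
Note: the punpck sequence above (together with the first half saved in t0/t1) is a 16x4 byte transpose. The vertical-edge filter reads the four pixels straddling the edge in each of 16 rows and rearranges them into the same full-width p1/p0/q0/q1 vectors the horizontal routine operates on. Conceptually (scalar sketch; the helper name and pointer convention are illustrative):

/* Gather the four pixels around a vertical edge from 16 rows.  src points
 * at the first row's q0 pixel and pitch is the row stride. */
static void gather_vertical_edge(const unsigned char *src, int pitch,
                                 unsigned char p1[16], unsigned char p0[16],
                                 unsigned char q0[16], unsigned char q1[16])
{
    for (int i = 0; i < 16; i++) {
        p1[i] = src[i * pitch - 2];
        p0[i] = src[i * pitch - 1];
        q0[i] = src[i * pitch + 0];
        q1[i] = src[i * pitch + 1];
    }
}
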
mov rdx, arg(2) ;blimit
; calculate mask
movdqa xmm6, xmm0 ; p1
movdqa xmm7, xmm3 ; q1
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm6, 1 ; abs(p1-q1)/2
movdqa xmm7, [rdx]
movdqa xmm5, xmm1 ; p0
movdqa xmm4, xmm2 ; q0
psubusb xmm5, xmm2 ; p0-=q0
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;blimit
movdqa xmm7, XMMWORD PTR [rdx]
movdqa xmm4, [GLOBAL(t80)]
psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm7, xmm7
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
movdqa t0, xmm0
movdqa t1, xmm3
pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
pxor xmm0, xmm4 ; p1 offset to convert to signed values
pxor xmm3, xmm4 ; q1 offset to convert to signed values
psubsb xmm0, xmm3 ; p1 - q1
movdqa xmm6, xmm1 ; p0
movdqa xmm7, xmm2 ; q0
pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
movdqa xmm3, xmm7 ; offset q0
psubsb xmm7, xmm6 ; q0 - p0
paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
movdqa xmm6, xmm1 ; p0
; movdqa xmm7, xmm2 ; q0
paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
pxor xmm6, xmm4 ; offset to convert to signed values
pxor xmm2, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm2 ; offset q0
psubsb xmm2, xmm6 ; q0 - p0
paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm0 ; mask filter values we don't care about
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 3
psrlw xmm0, 8
movdqa xmm7, xmm5 ; get a copy of filters
psraw xmm7, 11 ; arithmetic shift right 11
psllw xmm7, 8 ; shift left 8 to put it back
por xmm0, xmm7 ; put the two together to get result
psubsb xmm3, xmm0 ; q0-= q0sz add
pxor xmm3, [GLOBAL(t80)] ; unoffset q0
; now do +3 side
movdqa xmm0, xmm5
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 3
movdqa xmm1, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
psrlw xmm0, 8
psraw xmm5, 11 ; arithmetic shift right 11
pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm0, 3
pand xmm0, xmm2 ;clear out upper 3 bits
por xmm0, xmm7 ;add sign
psubsb xmm3, xmm0 ; q0-= q0sz add
psllw xmm5, 8 ; shift left 8 to put it back
por xmm0, xmm5 ; put the two together to get result
pxor xmm7, xmm7
pcmpgtb xmm7, xmm5 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm5, 3
pand xmm5, xmm2 ;clear out upper 3 bits
por xmm5, xmm7 ;add sign
paddsb xmm6, xmm5 ; p0+= p0 add
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset p0
pxor xmm3, xmm4 ; unoffset q0
pxor xmm6, xmm4 ; unoffset p0
movdqa xmm0, t0 ; p1
movdqa xmm4, t1 ; q1
@@ -1763,3 +1746,9 @@ s9:
align 16
s63:
times 8 dw 0x003f
align 16
te0:
times 16 db 0xe0
align 16
t1f:
times 16 db 0x1f
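
Note: te0 (16 bytes of 0xe0) and t1f (16 bytes of 0x1f) exist because SSE2 has no per-byte arithmetic shift. The new code does a 16-bit psrlw by 3, uses t1f to scrub the three bits that leak in from the neighbouring byte of each word, and ORs te0 back in wherever pcmpgtb flagged the byte as negative. For a single byte the idea is (sketch; sar3_byte is an illustrative name):

#include <stdint.h>

/* Per-byte arithmetic shift right by 3 built from the te0/t1f masks. */
static uint8_t sar3_byte(uint8_t x)
{
    uint8_t sign = ((int8_t)x < 0) ? 0xe0 : 0x00; /* pcmpgtb + pand te0 */
    return (uint8_t)(((x >> 3) & 0x1f) | sign);   /* psrlw 3, pand t1f, por */
}
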