Commit 439b2ecd authored by Fritz Koenig, committed by Code Review

Merge "Optimizations on the loopfilters."

parents 7288cdf7 0964ef0e
@@ -14,175 +14,172 @@
; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
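As an aside, here is a minimal C intrinsics sketch of the idea that comment names (not part of this patch; the function and argument names are illustrative assumptions, and the abs(p0 - q0)*2 + abs(p1 - q1)/2 versus flimit*2 + limit check that the macro also performs is omitted): rather than turning every |a - b| into its own psubusb/por/compare result, a byte-wise running maximum of the neighbouring differences is kept with pmaxub and tested against the limit once at the end.

    #include <emmintrin.h>

    /* |a - b| for packed unsigned bytes: psubusb both ways, then por. */
    static __m128i abs_diff_u8(__m128i a, __m128i b)
    {
        return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    }

    /* All-ones in each byte lane where every neighbouring absolute
     * difference is <= limit, zero otherwise (hypothetical helper). */
    static __m128i lf_neighbour_mask(__m128i limit,
                                     __m128i p3, __m128i p2,
                                     __m128i p1, __m128i p0,
                                     __m128i q0, __m128i q1,
                                     __m128i q2, __m128i q3)
    {
        __m128i m = abs_diff_u8(q3, q2);              /* abs(q3 - q2)       */
        m = _mm_max_epu8(m, abs_diff_u8(q2, q1));     /* pmaxub keeps a     */
        m = _mm_max_epu8(m, abs_diff_u8(q1, q0));     /* running maximum    */
        m = _mm_max_epu8(m, abs_diff_u8(p3, p2));     /* instead of a por   */
        m = _mm_max_epu8(m, abs_diff_u8(p2, p1));     /* chain per pair     */
        m = _mm_max_epu8(m, abs_diff_u8(p1, p0));
        m = _mm_subs_epu8(m, limit);                  /* 0 iff max <= limit */
        return _mm_cmpeq_epi8(m, _mm_setzero_si128());
    }

The macro below accumulates the same maximum into xmm1 with pmaxub, and along the way saves abs(q1 - q0) and abs(p1 - p0) to t0/t1 so the later high-edge-variance threshold test can reuse them.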
%macro LFH_FILTER_MASK 1
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
movdqa xmm2, [rdi+2*rax] ; q3
movdqa xmm1, [rsi+2*rax] ; q2
movdqa xmm4, [rsi+rax] ; q1
movdqa xmm5, [rsi] ; q0
neg rax ; negate pitch to deal with above border
%else
movq xmm0, [rsi + rcx*2] ; q3
movq xmm2, [rdi + rcx*2]
pslldq xmm2, 8
por xmm2, xmm0
movq xmm1, [rsi + rcx] ; q2
movq xmm3, [rdi + rcx]
pslldq xmm3, 8
por xmm1, xmm3
movlps xmm2, [rsi + rcx*2] ; q3
movlps xmm1, [rsi + rcx] ; q2
movlps xmm4, [rsi] ; q1
movlps xmm5, [rsi + rax] ; q0
movhps xmm2, [rdi + rcx*2]
movhps xmm1, [rdi + rcx]
movhps xmm4, [rdi]
movhps xmm5, [rdi + rax]
lea rsi, [rsi + rax*4]
lea rdi, [rdi + rax*4]
movdqa XMMWORD PTR [rsp], xmm1 ; store q2
movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
%endif
movdqa xmm6, xmm1 ; q2
movdqa xmm3, xmm4 ; q1
psubusb xmm1, xmm2 ; q2-=q3
psubusb xmm2, xmm6 ; q3-=q2
por xmm1, xmm2 ; abs(q3-q2)
%if %1
movdqa xmm4, [rsi+rax] ; q1
%else
movq xmm0, [rsi] ; q1
movq xmm4, [rdi]
pslldq xmm4, 8
por xmm4, xmm0
movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
%endif
movdqa xmm3, xmm4 ; q1
psubusb xmm4, xmm6 ; q1-=q2
psubusb xmm6, xmm3 ; q2-=q1
por xmm4, xmm6 ; abs(q2-q1)
pmaxub xmm1, xmm4
por xmm1, xmm2 ; abs(q3-q2)
%if %1
movdqa xmm4, [rsi] ; q0
%else
movq xmm4, [rsi + rax] ; q0
movq xmm0, [rdi + rax]
pslldq xmm0, 8
por xmm4, xmm0
%endif
movdqa xmm0, xmm5 ; q0
pmaxub xmm1, xmm4
movdqa xmm0, xmm4 ; q0
psubusb xmm4, xmm3 ; q0-=q1
psubusb xmm5, xmm3 ; q0-=q1
psubusb xmm3, xmm0 ; q1-=q0
por xmm4, xmm3 ; abs(q0-q1)
movdqa t0, xmm4 ; save to t0
pmaxub xmm1, xmm4
%if %1
neg rax ; negate pitch to deal with above border
por xmm5, xmm3 ; abs(q0-q1)
movdqa t0, xmm5 ; save to t0
pmaxub xmm1, xmm5
%if %1
movdqa xmm2, [rsi+4*rax] ; p3
movdqa xmm4, [rdi+4*rax] ; p2
movdqa xmm6, [rsi+2*rax] ; p1
%else
lea rsi, [rsi + rax*4]
lea rdi, [rdi + rax*4]
movlps xmm2, [rsi + rax] ; p3
movlps xmm4, [rsi] ; p2
movlps xmm6, [rsi + rcx] ; p1
movhps xmm2, [rdi + rax]
movhps xmm4, [rdi]
movhps xmm6, [rdi + rcx]
movq xmm2, [rsi + rax] ; p3
movq xmm3, [rdi + rax]
pslldq xmm3, 8
por xmm2, xmm3
movq xmm4, [rsi] ; p2
movq xmm5, [rdi]
pslldq xmm5, 8
por xmm4, xmm5
movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
%endif
movdqa xmm5, xmm4 ; p2
movdqa xmm3, xmm6 ; p1
psubusb xmm4, xmm2 ; p2-=p3
psubusb xmm2, xmm5 ; p3-=p2
por xmm4, xmm2 ; abs(p3 - p2)
pmaxub xmm1, xmm4
%if %1
movdqa xmm4, [rsi+2*rax] ; p1
%else
movq xmm4, [rsi + rcx] ; p1
movq xmm3, [rdi + rcx]
pslldq xmm3, 8
por xmm4, xmm3
movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1
%endif
psubusb xmm3, xmm5 ; p1-=p2
pmaxub xmm1, xmm4 ; abs(p3 - p2)
movdqa xmm3, xmm4 ; p1
psubusb xmm4, xmm5 ; p1-=p2
psubusb xmm5, xmm3 ; p2-=p1
por xmm4, xmm5 ; abs(p2 - p1)
pmaxub xmm1, xmm4
psubusb xmm5, xmm6 ; p2-=p1
pmaxub xmm1, xmm2 ; abs(p3 - p2)
movdqa xmm2, xmm3 ; p1
pmaxub xmm1, xmm5 ; abs(p2 - p1)
movdqa xmm2, xmm6 ; p1
pmaxub xmm1, xmm3 ; abs(p2 - p1)
%if %1
movdqa xmm4, [rsi+rax] ; p0
movdqa xmm3, [rdi] ; q1
%else
movq xmm4, [rsi + rcx*2] ; p0
movq xmm5, [rdi + rcx*2]
pslldq xmm5, 8
por xmm4, xmm5
movlps xmm4, [rsi + rcx*2] ; p0
movhps xmm4, [rdi + rcx*2]
movdqa xmm3, q1 ; q1
%endif
movdqa xmm5, xmm4 ; p0
psubusb xmm4, xmm3 ; p0-=p1
psubusb xmm3, xmm5 ; p1-=p0
por xmm4, xmm3 ; abs(p1 - p0)
movdqa t1, xmm4 ; save to t1
psubusb xmm4, xmm6 ; p0-=p1
pmaxub xmm1, xmm4
psubusb xmm1, xmm7
psubusb xmm6, xmm5 ; p1-=p0
%if %1
movdqa xmm3, [rdi] ; q1
%else
movdqa xmm3, q1 ; q1
%endif
por xmm6, xmm4 ; abs(p1 - p0)
mov rdx, arg(2) ; get flimit
movdqa t1, xmm6 ; save to t1
movdqa xmm4, xmm3 ; q1
pmaxub xmm1, xmm6
psubusb xmm3, xmm2 ; q1-=p1
psubusb xmm2, xmm4 ; p1-=q1
psubusb xmm1, xmm7
por xmm2, xmm3 ; abs(p1-q1)
movdqa xmm4, XMMWORD PTR [rdx] ; flimit
movdqa xmm3, xmm0 ; q0
pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
psrlw xmm2, 1 ; abs(p1-q1)/2
mov rdx, arg(4) ; hev get thresh
movdqa xmm6, xmm5 ; p0
movdqa xmm3, xmm0 ; q0
psrlw xmm2, 1 ; abs(p1-q1)/2
psubusb xmm5, xmm3 ; p0-=q0
paddb xmm4, xmm4 ; flimit*2 (less than 255)
psubusb xmm3, xmm6 ; q0-=p0
por xmm5, xmm3 ; abs(p0 - q0)
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255)
movdqa xmm4, t0 ; hev get abs (q1 - q0)
movdqa xmm3, t1 ; get abs (p1 - p0)
paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ; get flimit
movdqa xmm2, XMMWORD PTR [rdx]
paddb xmm2, xmm2 ; flimit*2 (less than 255)
paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
movdqa xmm2, XMMWORD PTR [rdx] ; hev
psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb xmm4, xmm2 ; hev
psubusb xmm3, xmm2 ; hev
por xmm1, xmm5
pxor xmm5, xmm5
pcmpeqb xmm1, xmm5 ; mask mm1
%endmacro
%macro LFH_HEV_MASK 0
mov rdx, arg(4) ; get thresh
movdqa xmm7, XMMWORD PTR [rdx]
pxor xmm7, xmm7
paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
movdqa xmm4, t0 ; get abs (q1 - q0)
psubusb xmm4, xmm7
movdqa xmm3, t1 ; get abs (p1 - p0)
psubusb xmm3, xmm7
paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
pcmpeqb xmm4, xmm5
pcmpeqb xmm4, xmm5 ; hev
pcmpeqb xmm3, xmm3 ; hev
pcmpeqb xmm5, xmm5
pxor xmm4, xmm5
pcmpeqb xmm1, xmm7 ; mask xmm1
pxor xmm4, xmm3 ; hev
%endmacro
%macro BH_FILTER 1
%if %1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
%else
%macro B_FILTER 1
%if %1 == 0
movdqa xmm2, p1 ; p1
movdqa xmm7, q1 ; q1
%elif %1 == 1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
%elif %1 == 2
lea rdx, srct
movdqa xmm2, [rdx] ; p1
movdqa xmm7, [rdx+48] ; q1
movdqa xmm6, [rdx+16] ; p0
movdqa xmm0, [rdx+32] ; q0
%endif
pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
@@ -196,88 +193,84 @@
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
pand xmm1, xmm2 ; mask filter values we don't care about
movdqa xmm2, xmm1
paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
punpckhbw xmm5, xmm2 ; axbxcxdx
punpcklbw xmm2, xmm2 ; exfxgxhx
punpcklbw xmm0, xmm1 ; exfxgxhx
psraw xmm5, 11 ; sign extended shift right by 3
psraw xmm2, 11 ; sign extended shift right by 3
packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
punpcklbw xmm0, xmm1 ; exfxgxhx
punpckhbw xmm1, xmm1 ; axbxcxdx
psraw xmm2, 11 ; sign extended shift right by 3
packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
psraw xmm0, 11 ; sign extended shift right by 3
psraw xmm1, 11 ; sign extended shift right by 3
psraw xmm1, 11 ; sign extended shift right by 3
movdqa xmm5, xmm0 ; save results
packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
paddsw xmm5, [ones GLOBAL]
paddsw xmm1, [ones GLOBAL]
paddsw xmm1, [ones GLOBAL]
psraw xmm5, 1 ; partial shifted one more time for 2nd tap
psraw xmm1, 1 ; partial shifted one more time for 2nd tap
packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
pandn xmm4, xmm5 ; high edge variance additive
%endmacro
psraw xmm1, 1 ; partial shifted one more time for 2nd tap
%macro BH_WRITEBACK 1
paddsb xmm6, xmm2 ; p0+= p0 add
pxor xmm6, [t80 GLOBAL] ; unoffset
%if %1
movdqa [rsi+rax], xmm6 ; write back
%else
lea rsi, [rsi + rcx*2]
lea rdi, [rdi + rcx*2]
movq MMWORD PTR [rsi], xmm6 ; p0
psrldq xmm6, 8
movq MMWORD PTR [rdi], xmm6
%endif
packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
%if %1
movdqa xmm6, [rsi+2*rax] ; p1
%else
movdqa xmm6, p1 ; p1
%if %1 == 0
movdqa xmm1, p1 ; p1
%elif %1 == 1
movdqa xmm1, [rsi+2*rax] ; p1
%elif %1 == 2
movdqa xmm1, [rdx] ; p1
%endif
pxor xmm6, [t80 GLOBAL] ; reoffset
paddsb xmm6, xmm4 ; p1+= p1 add
pandn xmm4, xmm5 ; high edge variance additive
pxor xmm6, [t80 GLOBAL] ; unoffset
%if %1
movdqa [rsi+2*rax], xmm6 ; write back
%else
movq MMWORD PTR [rsi + rax], xmm6 ; p1
psrldq xmm6, 8
movq MMWORD PTR [rdi + rax], xmm6
%endif
pxor xmm1, [t80 GLOBAL] ; reoffset
psubsb xmm3, xmm0 ; q0-= q0 add
paddsb xmm1, xmm4 ; p1+= p1 add
pxor xmm3, [t80 GLOBAL] ; unoffset
%if %1
movdqa [rsi], xmm3 ; write back
%else
movq MMWORD PTR [rsi + rcx], xmm3 ; q0
psrldq xmm3, 8
movq MMWORD PTR [rdi + rcx], xmm3
%endif
pxor xmm1, [t80 GLOBAL] ; unoffset
psubsb xmm7, xmm4 ; q1-= q1 add
pxor xmm7, [t80 GLOBAL] ; unoffset
%if %1
movdqa [rdi], xmm7 ; write back
%else
%if %1 == 0
lea rsi, [rsi + rcx*2]
lea rdi, [rdi + rcx*2]
movq MMWORD PTR [rsi], xmm6 ; p0
movhps MMWORD PTR [rdi], xmm6
movq MMWORD PTR [rsi + rax], xmm1 ; p1
movhps MMWORD PTR [rdi + rax], xmm1
movq MMWORD PTR [rsi + rcx], xmm3 ; q0
movhps MMWORD PTR [rdi + rcx], xmm3
movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
psrldq xmm7, 8
movq MMWORD PTR [rdi + rcx*2],xmm7
movhps MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
movdqa [rsi+rax], xmm6 ; write back
movdqa [rsi+2*rax], xmm1 ; write back
movdqa [rsi], xmm3 ; write back
movdqa [rdi], xmm7 ; write back
%endif
%endmacro
@@ -314,16 +307,10 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
; calculate breakout conditions
LFH_FILTER_MASK 1
; calculate high edge variance
LFH_HEV_MASK
; start work on filters
BH_FILTER 1
; write back the result
BH_WRITEBACK 1
; calculate breakout conditions and high edge variance
LFH_FILTER_AND_HEV_MASK 1
; filter and write back the result
B_FILTER 1
add rsp, 32
pop rsp
@@ -378,15 +365,10 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
; calculate breakout conditions
LFH_FILTER_MASK 0
; calculate high edge variance
LFH_HEV_MASK
; start work on filters
BH_FILTER 0
; write back the result
BH_WRITEBACK 0
; calculate breakout conditions and high edge variance
LFH_FILTER_AND_HEV_MASK 0
; filter and write back the result
B_FILTER 0
add rsp, 96
pop rsp
@@ -400,208 +382,191 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
ret
%macro MBH_FILTER 1
%if %1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
%else
movdqa xmm2, p1 ; p1
movdqa xmm7, q1 ; q1
%endif
pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
movdqa xmm2, p1 ; p1
movdqa xmm7, q1 ; q1
%elif %1 == 1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
psubsb xmm2, xmm7 ; p1 - q1
pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
paddsb xmm2, xmm0 ; 2 * (q0 - p0)
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
mov rcx, rax
neg rcx
%elif %1 == 2
lea rdx, srct
movdqa xmm2, [rdx+32] ; p1
movdqa xmm7, [rdx+80] ; q1
movdqa xmm6, [rdx+48] ; p0
movdqa xmm0, [rdx+64] ; q0
%endif
pand xmm1, xmm2 ; mask filter values we don't care about
movdqa xmm2, xmm1 ; vp8_filter
pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
movdqa xmm5, xmm2
paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3)
psubsb xmm2, xmm7 ; p1 - q1
movdqa xmm3, xmm0 ; q0
punpckhbw xmm7, xmm5 ; axbxcxdx
punpcklbw xmm5, xmm5 ; exfxgxhx
psubsb xmm0, xmm6 ; q0 - p0
psraw xmm7, 11 ; sign extended shift right by 3
psraw xmm5, 11 ; sign extended shift right by 3
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
packsswb xmm5, xmm7 ; Filter2 >>=3;
paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
paddsb xmm2, xmm0 ; 2 * (q0 - p0)
punpckhbw xmm7, xmm2 ; axbxcxdx
punpcklbw xmm0, xmm2 ; exfxgxhx
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
psraw xmm7, 11 ; sign extended shift right by 3
psraw xmm0, 11 ; sign extended shift right by 3
pand xmm1, xmm2 ; mask filter values we don't care about
packsswb xmm0, xmm7 ; Filter2 >>=3;
paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
movdqa xmm2, xmm1 ; vp8_filter
psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
pandn xmm4, xmm1 ; vp8_filter&=~hev
%endmacro
pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
pxor xmm0, xmm0
%macro MBH_WRITEBACK 1
; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
; s = vp8_signed_char_clamp(qs0 - u);
; *oq0 = s^0x80;
; s = vp8_signed_char_clamp(ps0 + u);
; *op0 = s^0x80;
pandn xmm4, xmm1 ; vp8_filter&=~hev
pxor xmm1, xmm1
pxor xmm2, xmm2
punpcklbw xmm1, xmm4
punpcklbw xmm0, xmm4 ; Filter 2 (hi)
movdqa xmm5, xmm2
punpckhbw xmm2, xmm4
pmulhw xmm1, [s27 GLOBAL]
punpckhbw xmm1, xmm4 ; Filter 2 (lo)
paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3)
pmulhw xmm2, [s27 GLOBAL]
paddw xmm1, [s63 GLOBAL]
pmulhw xmm1, [s9 GLOBAL] ; Filter 2 (lo) * 9
paddw xmm2, [s63 GLOBAL]
psraw xmm1, 7
pmulhw xmm0, [s9 GLOBAL] ; Filter 2 (hi) * 9
psraw xmm2, 7
packsswb xmm1, xmm2
punpckhbw xmm7, xmm5 ; axbxcxdx
paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
psubsb xmm3, xmm1
paddsb xmm6, xmm1
punpcklbw xmm5, xmm5 ; exfxgxhx
psraw xmm7, 11 ; sign extended shift right by 3
pxor xmm3, [t80 GLOBAL]
pxor xmm6, [t80 GLOBAL]
psraw xmm5, 11 ; sign extended shift right by 3
punpckhbw xmm4, xmm2 ; axbxcxdx
%if %1
movdqa XMMWORD PTR [rsi+rax], xmm6
movdqa XMMWORD PTR [rsi], xmm3
%else
lea rsi, [rsi + rcx*2]
lea rdi, [rdi + rcx*2]
punpcklbw xmm2, xmm2 ; exfxgxhx
psraw xmm4, 11 ; sign extended shift right by 3
movq MMWORD PTR [rsi], xmm6 ; p0
psrldq xmm6, 8
movq MMWORD PTR [rdi], xmm6
movq MMWORD PTR [rsi + rcx], xmm3 ; q0
psrldq xmm3, 8
movq MMWORD PTR [rdi + rcx], xmm3
%endif
packsswb xmm5, xmm7 ; Filter2 >>=3;
psraw xmm2, 11 ; sign extended shift right by 3
; roughly 2/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
; s = vp8_signed_char_clamp(qs1 - u);
; *oq1 = s^0x80;
; s = vp8_signed_char_clamp(ps1 + u);
; *op1 = s^0x80;
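For reference, a scalar sketch of the update those comment blocks describe (the clamp is the usual saturation to [-128, 127]; the samples here are the signed, 0x80-offset values, and the final ^0x80 that converts back to unsigned pixels is left to the caller; names are illustrative only):

    #include <stdint.h>

    /* Saturate to the signed char range, as the commented clamp does. */
    static int8_t signed_char_clamp(int t)
    {
        if (t < -128) return -128;
        if (t >  127) return  127;
        return (int8_t)t;
    }

    /* One tap: u = clamp((63 + filter * tap) >> 7), subtracted from the
     * q-side sample and added to the p-side one.  The comments above use
     * tap = 27 for p0/q0 and tap = 18 for p1/q1. */
    static void apply_tap(int filter, int tap, int8_t *ps, int8_t *qs)
    {
        int u = signed_char_clamp((63 + filter * tap) >> 7);
        *qs = signed_char_clamp(*qs - u);
        *ps = signed_char_clamp(*ps + u);
    }

In the SIMD code above this arithmetic shows up as pmulhw against the s27/s9 constants, paddw with s63 for the rounding term, and psraw by 7 before packsswb narrows the result back to bytes.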
pxor xmm1, xmm1
pxor xmm2, xmm2
packsswb xmm2, xmm4 ; Filter1 >>=3;
movdqa