Commit 3fb55d24 authored by James Zern

Revert "Code clean of sub_pixel_variance4xh"

This reverts commit 2468163e.

The change causes valgrind errors due to an over-read of a buffer in SubpelVarianceTest.

Change-Id: I448e52c76f815ac199305b71f7d169f2bc167679
parent fa5d54f9
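
The over-read called out in the commit message is the motivation for the revert. As a rough, hypothetical illustration only (a minimal C++ sketch, not the actual libvpx code path, its buffer sizes, or the specific load that failed), this is the general pattern valgrind reports when an 8-byte load is applied to rows that are only 4 bytes wide:

// Illustrative sketch only: a heap block sized for a 4x4 region with no
// padding, read row by row with an 8-byte load. The load on the last row
// touches 4 bytes past the end of the allocation, which valgrind flags as
// an invalid read even though the extra bytes are never used.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const int width = 4, height = 4;
  uint8_t *block = new uint8_t[width * height];  // exactly 16 bytes, no padding
  std::memset(block, 1, width * height);

  uint64_t sum = 0;
  for (int row = 0; row < height; ++row) {
    uint64_t eight_bytes;
    // Loads 8 bytes from a 4-byte-wide row; on the final row this reads
    // 4 bytes beyond the allocation.
    std::memcpy(&eight_bytes, block + row * width, sizeof(eight_bytes));
    sum += static_cast<uint32_t>(eight_bytes);  // only the low 4 bytes are used
  }

  std::printf("sum of low dwords: %llu\n", static_cast<unsigned long long>(sum));
  delete[] block;
  return 0;
}

Under valgrind this produces something like "Invalid read of size 8" on the last iteration; which exact load in the reverted SSE2 4xh path triggered the report is not stated in the commit message.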
@@ -1026,8 +1026,8 @@ INSTANTIATE_TEST_CASE_P(
     make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
     make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
     make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-    make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
-    make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+    make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
+    make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
@@ -1043,8 +1043,8 @@ INSTANTIATE_TEST_CASE_P(
     make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
     make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
     make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-    make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
-    make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+    make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
+    make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
 #endif  // CONFIG_USE_X86INC
 #if CONFIG_VP9_HIGHBITDEPTH
...
@@ -1493,10 +1493,10 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int s
 specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1532,10 +1532,10 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, i
 specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
 #
 # Specialty Subpixel
...
@@ -57,8 +57,8 @@ SECTION .text
   paddd %6, %1
 %endmacro
-%macro STORE_AND_RET 1
-%if %1 > 4
+%macro STORE_AND_RET 0
+%if mmsize == 16
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -78,16 +78,16 @@ SECTION .text
   movd [r1], m7 ; store sse
   paddd m6, m4
   movd raxd, m6 ; store sum as return value
-%else ; 4xh
-  pshuflw m4, m6, 0xe
-  pshuflw m3, m7, 0xe
+%else ; mmsize == 8
+  pshufw m4, m6, 0xe
+  pshufw m3, m7, 0xe
   paddw m6, m4
   paddd m7, m3
   pcmpgtw m5, m6 ; mask for 0 > x
   mov r1, ssem ; r1 = unsigned int *sse
   punpcklwd m6, m5 ; sign-extend m6 word->dword
   movd [r1], m7 ; store sse
-  pshuflw m4, m6, 0xe
+  pshufw m4, m6, 0xe
   paddd m6, m4
   movd raxd, m6 ; store sum as return value
 %endif
@@ -226,14 +226,8 @@ SECTION .text
   punpckhbw m3, m1, m5
   punpcklbw m1, m5
 %endif
-%if %1 > 4
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %if %2 == 0 ; !avg
   punpckhbw m3, m1, m5
   punpcklbw m1, m5
@@ -245,40 +239,22 @@ SECTION .text
 %else ; %1 < 16
   movh m0, [srcq]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps m0, [srcq+src_strideq]
-%else ; 4xh
-  movd m1, [srcq+src_strideq]
-  punpckldq m0, m1
+%else ; mmsize == 8
+  punpckldq m0, [srcq+src_strideq]
 %endif
 %else ; !avg
   movh m2, [srcq+src_strideq]
 %endif
-%if %1 > 4
   movh m1, [dstq]
   movh m3, [dstq+dst_strideq]
-%else ; 4xh
-  movd m1, [dstq]
-  movd m3, [dstq+dst_strideq]
-%endif
 %if %2 == 1 ; avg
-%if %1 > 4
   pavgb m0, [secq]
-%else
-  movh m2, [secq]
-  pavgb m0, m2
-%endif
   punpcklbw m3, m5
   punpcklbw m1, m5
-%if %1 > 4
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else ; 4xh
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %else ; !avg
   punpcklbw m0, m5
   punpcklbw m2, m5
@@ -295,7 +271,7 @@ SECTION .text
 %endif
   dec block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_zero_y_nonzero:
   cmp y_offsetd, 4
@@ -323,9 +299,9 @@ SECTION .text
   movh m0, [srcq]
   movh m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps m2, [srcq+src_strideq*2]
-%else ; 4xh
+%else ; mmsize == 8
 %if %1 == 4
   movh m1, [srcq+src_strideq*2]
   punpckldq m2, m1
@@ -334,26 +310,18 @@ SECTION .text
 %endif
 %endif
   movh m1, [dstq]
-%if %1 > 4
+%if mmsize == 16
   movlhps m0, m2
-%else ; 4xh
+%else ; mmsize == 8
   punpckldq m0, m2
 %endif
   movh m3, [dstq+dst_strideq]
   pavgb m0, m2
   punpcklbw m1, m5
-%if %1 > 4
   pavgb m0, [secq]
   punpcklbw m3, m5
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else ; 4xh
-  movh m4, [secq]
-  pavgb m0, m4
-  punpcklbw m3, m5
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %else ; !avg
   movh m4, [srcq+src_strideq*2]
   movh m1, [dstq]
@@ -375,7 +343,7 @@ SECTION .text
 %endif
   dec block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -383,7 +351,7 @@ SECTION .text
   lea bilin_filter, [bilin_filter_m]
 %endif
   shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova m9, [bilin_filter+y_offsetq+16]
@@ -488,20 +456,10 @@ SECTION .text
   psraw m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps m0, m2
-%endif
   packuswb m0, m2
-%if %1 > 4
   pavgb m0, [secq]
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else ; 4xh
-  movh m2, [secq]
-  pavgb m0, m2
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %endif
   punpcklbw m1, m5
   SUM_SSE m0, m1, m2, m3, m6, m7
@@ -517,7 +475,7 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_nonzero:
   cmp x_offsetd, 4
@@ -548,31 +506,21 @@ SECTION .text
   movh m0, [srcq]
   movh m4, [srcq+1]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps m0, [srcq+src_strideq]
   movhps m4, [srcq+src_strideq+1]
-%else ; 4xh
-  movd m1, [srcq+src_strideq]
-  punpckldq m0, m1
-  movd m2, [srcq+src_strideq+1]
-  punpckldq m4, m2
+%else ; mmsize == 8
+  punpckldq m0, [srcq+src_strideq]
+  punpckldq m4, [srcq+src_strideq+1]
 %endif
   movh m1, [dstq]
   movh m3, [dstq+dst_strideq]
   pavgb m0, m4
   punpcklbw m3, m5
-%if %1 > 4
   pavgb m0, [secq]
   punpcklbw m1, m5
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else ; 4xh
-  movh m2, [secq]
-  pavgb m0, m2
-  punpcklbw m1, m5
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %else ; !avg
   movh m2, [srcq+src_strideq]
   movh m1, [dstq]
@@ -595,7 +543,7 @@ SECTION .text
 %endif
   dec block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_half_y_nonzero:
   cmp y_offsetd, 4
@@ -638,7 +586,7 @@ SECTION .text
   movh m2, [srcq]
   movh m3, [srcq+1]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps m2, [srcq+src_strideq]
   movhps m3, [srcq+src_strideq+1]
 %else
@@ -653,31 +601,21 @@ SECTION .text
 %endif
 %endif
   pavgb m2, m3
-%if %1 > 4
+%if mmsize == 16
   movlhps m0, m2
   movhlps m4, m2
-%else ; 4xh
+%else ; mmsize == 8
   punpckldq m0, m2
-  pshuflw m4, m2, 0xe
+  pshufw m4, m2, 0xe
 %endif
   movh m1, [dstq]
   pavgb m0, m2
   movh m3, [dstq+dst_strideq]
-%if %1 > 4
   pavgb m0, [secq]
-%else
-  movh m2, [secq]
-  pavgb m0, m2
-%endif
   punpcklbw m3, m5
   punpcklbw m1, m5
-%if %1 > 4
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %else ; !avg
   movh m4, [srcq+src_strideq]
   movh m1, [srcq+src_strideq+1]
@@ -703,7 +641,7 @@ SECTION .text
 %endif
   dec block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -711,7 +649,7 @@ SECTION .text
   lea bilin_filter, [bilin_filter_m]
 %endif
   shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova m9, [bilin_filter+y_offsetq+16]
@@ -828,20 +766,10 @@ SECTION .text
   psraw m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps m0, m2
-%endif
   packuswb m0, m2
-%if %1 > 4
   pavgb m0, [secq]
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else
-  movh m2, [secq]
-  pavgb m0, m2
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %endif
   punpcklbw m1, m5
   SUM_SSE m0, m1, m2, m3, m6, m7
@@ -858,7 +786,7 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_nonhalf:
   test y_offsetd, y_offsetd
@@ -869,7 +797,7 @@ SECTION .text
   lea bilin_filter, [bilin_filter_m]
 %endif
   shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova m9, [bilin_filter+x_offsetq+16]
@@ -971,20 +899,10 @@ SECTION .text
   psraw m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps m0, m2
-%endif
   packuswb m0, m2
-%if %1 > 4
   pavgb m0, [secq]
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else
-  movh m2, [secq]
-  pavgb m0, m2
-  punpcklbw m0, m5
-  movhlps m2, m0
-%endif
 %endif
   punpcklbw m1, m5
   SUM_SSE m0, m1, m2, m3, m6, m7
@@ -1000,7 +918,7 @@ SECTION .text
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 .x_nonhalf_y_nonzero:
   cmp y_offsetd, 4
@@ -1011,7 +929,7 @@ SECTION .text
   lea bilin_filter, [bilin_filter_m]
 %endif
   shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova m9, [bilin_filter+x_offsetq+16]
@@ -1171,20 +1089,10 @@ SECTION .text
   pavgw m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
-  movlhps m0, m2
-%endif
   packuswb m0, m2
-%if %1 > 4
   pavgb m0, [secq]
   punpckhbw m2, m0, m5
   punpcklbw m0, m5
-%else
-  movh m2, [secq]
-  pavgb m0, m2
-  punpcklbw m0, m5