Commit d7289658 authored by Yunqing Wang

Remove TEXTREL from 32bit encoder

This patch fixes the issue reported in "Issue 655: remove textrel's
from 32-bit vp9 encoder". The set of vp9_subpel_variance functions
that use the x86inc.asm ABI did not build correctly as 32-bit PIC
code. The fix had to be done carefully because there are not enough
free registers in that configuration.

After the change, we got
$ eu-findtextrel libvpx.so
eu-findtextrel: no text relocations reported in 'libvpx.so'

Change-Id: I1b176311dedaf48eaee0a1e777588043c97cea82
parent dd45530e
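
For context on the diff below: in the 32-bit PIC build, essentially all usable general-purpose registers are already taken by the function arguments, so there is no spare register to address the bilinear filter tables through the GOT. The change therefore declares two extra cglobal arguments (g_bilin_filter, g_pw_8) and uses their stack slots as scratch storage: the table addresses are resolved once at function entry, spilled to those slots, and later reloaded into whichever register happens to be free at that point (x_offsetq, y_offsetq, or src_strideq, renamed tempq). The source stride likewise moves to a memory operand (src_stridemp) through the new INC_SRC_BY_SRC_STRIDE macro once its register has been repurposed. A condensed sketch of the pattern, assembled from the hunks below (GET_GOT, GLOBAL, and LOAD_IF_USED are helper macros that already exist elsewhere in libvpx; this sketch is not an additional change):

    ; prologue of the x86-32 PIC variant
    GET_GOT eax                      ; set up GOT-relative addressing
    add esp, 4                       ; restore esp
    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx         ; spill filter-table address to its stack slot
    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx                 ; spill rounding-constant address
    LOAD_IF_USED 0, 1                ; reload eax, ecx argument registers

    ; later, once a register (here x_offset) is no longer needed:
    %define tempq x_offsetq
    add y_offsetq, g_bilin_filterm   ; y_offsetq now points at its filter pair
    mov tempq, g_pw_8m
    %define filter_rnd [tempq]
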
@@ -118,6 +118,14 @@ SECTION .text
RET
%endmacro
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+add srcq, src_stridemp
+%else
+add srcq, src_strideq
+%endif
+%endmacro
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@ SECTION .text
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-x_offset, y_offset, \
-dst, dst_stride, \
-sec, sec_stride, height, sse
-%define sec_str sec_strideq
-%else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
-dst, dst_stride, height, sse
-%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-x_offset, y_offset, \
-dst, dst_stride, \
-sec, sec_stride, \
-height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
-dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
+%ifdef PIC ; 64bit PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+x_offset, y_offset, \
+dst, dst_stride, \
+sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
+cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+y_offset, dst, dst_stride, height, sse
+%endif
+%define h heightd
+%define bilin_filter sseq
+%else
+%if ARCH_X86=1 && CONFIG_PIC=1
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+x_offset, y_offset, \
+dst, dst_stride, \
+sec, sec_stride, \
+height, sse, g_bilin_filter, g_pw_8
+%define h dword heightm
+%define sec_str sec_stridemp
+;Store bilin_filter and pw_8 location in stack
+GET_GOT eax
+add esp, 4 ; restore esp
+lea ecx, [GLOBAL(bilin_filter_m)]
+mov g_bilin_filterm, ecx
+lea ecx, [GLOBAL(pw_8)]
+mov g_pw_8m, ecx
+LOAD_IF_USED 0, 1 ; load eax, ecx back
+%else
+cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+y_offset, dst, dst_stride, height, sse, \
+g_bilin_filter, g_pw_8
+%define h heightd
+;Store bilin_filter and pw_8 location in stack
+GET_GOT eax
+add esp, 4 ; restore esp
+lea ecx, [GLOBAL(bilin_filter_m)]
+mov g_bilin_filterm, ecx
+lea ecx, [GLOBAL(pw_8)]
+mov g_pw_8m, ecx
+LOAD_IF_USED 0, 1 ; load eax, ecx back
+%endif
+%else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+x_offset, y_offset, \
+dst, dst_stride, \
+sec, sec_stride, \
+height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
+cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+y_offset, dst, dst_stride, height, sse
+%define h heightd
+%endif
+%define bilin_filter bilin_filter_m
+%endif
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
@@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
@@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
+%else ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
@@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
@@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
@@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+mov tempq, g_bilin_filterm
+add x_offsetq, tempq
+add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
movu m0, [srcq]
@@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
-add srcq, src_strideq
+INC_SRC_BY_SRC_STRIDE
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
-add srcq, src_strideq
+INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
@@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if cpuflag(ssse3)
packuswb m0, m0
%endif
-add srcq, src_strideq
+INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
-movh m4, [srcq+src_strideq]
-movh m3, [srcq+src_strideq+1]
+INC_SRC_BY_SRC_STRIDE
+movh m4, [srcq]
+movh m3, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
@@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
-lea srcq, [srcq+src_strideq*2]
+INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg