diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
deleted file mode 100644
index 7b1dc119f080b19b78b3af9ff293404b03e88d07..0000000000000000000000000000000000000000
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp8_fast_quantize_b_ssse3) PRIVATE
-sym(vp8_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov        rdi, rdi                    ; BLOCK *b
-    ;mov        rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_round]
-    mov         rdx, [rdi + vp8_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp8_blockd_qcoeff]
-    mov         rdi, [rsi + vp8_blockd_dequant]
-    mov         rcx, [rsi + vp8_blockd_dqcoeff]
-
-    movdqa      xmm2, xmm1                  ;store y for getting eob
-    movdqa      xmm3, xmm5
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm0, [rdi]
-    movdqa      xmm4, [rdi + 16]
-
-    pmullw      xmm0, xmm1
-    pmullw      xmm4, xmm5
-    pxor        xmm1, xmm1
-
-    pcmpgtw     xmm2, xmm1                  ;calculate eob
-    pcmpgtw     xmm3, xmm1
-    packsswb    xmm2, xmm3
-    pshufb      xmm2, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm2
-
-    movdqa      [rcx], xmm0                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm4            ;store dqcoeff
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    bsr         eax, edx                    ;count 0
-    add         eax, 1
-
-    cmp         edx, 0                      ;if all 0, eob=0
-    cmove       eax, edx
-
-    mov         BYTE PTR [rcx], al          ;store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
new file mode 100644
index 0000000000000000000000000000000000000000..9b4471d4f11f79f6916dc2057c4cb1f858389d0f
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> /* SSSE3 */
+
+#include "vp8/encoder/block.h"
+
+/* bitscan reverse (bsr) */
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static int bsr(int mask) {
+  int eob;
+  _BitScanReverse(&eob, mask);
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#else
+static int bsr(int mask) {
+  int eob;
+  asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#endif
+
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
+  int eob, mask;
+
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+  __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
+
+  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
+    { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+  __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* ASM saves Y for EOB */
+  /* I think we can ignore that because adding the sign doesn't change anything
+   * and multiplying 0 by dequant is OK as well */
+  abs0 = y0;
+  abs1 = y1;
+
+  /* Restore the sign bit. */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  x0 = _mm_mullo_epi16(x0, dequant0);
+  x1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), x0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);
+
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpgt_epi16(abs0, zeros);
+  x1 = _mm_cmpgt_epi16(abs1, zeros);
+
+  x = _mm_packs_epi16(x0, x1);
+
+  x = _mm_shuffle_epi8(x, zig_zag);
+
+  mask = _mm_movemask_epi8(x);
+
+  eob = bsr(mask);
+
+  *d->eob = 0xFF & eob;
+}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index d7c6dd1e1e6000cea889bbbbafed1c747fa9ab21..607382b4c4015830c767c7f8962a309426866a2e 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -88,6 +88,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -96,7 +97,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
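Reviewer note (not part of the change): the intrinsics above follow the same flow as VP8's scalar fast quantizer (vp8_fast_quantize_b_c in vp8/encoder/quantize.c). The sketch below is a minimal scalar restatement of that computation for comparison only; it uses the same BLOCK/BLOCKD fields and include as the new file, the zig_zag table is the same one the pshufb mask encodes (entry i is the raster position of the i-th coefficient in scan order), and the helper name fast_quantize_b_scalar_ref is made up for illustration.

/* Illustrative scalar sketch, not part of this commit. */
#include <stdlib.h> /* abs */

#include "vp8/encoder/block.h"

static void fast_quantize_b_scalar_ref(BLOCK *b, BLOCKD *d) {
  /* Raster position of each coefficient in zig-zag scan order;
   * identical to pshufb_zig_zag_mask above. */
  static const int zig_zag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                   9, 12, 13, 10, 7, 11, 14, 15 };
  int i, eob = 0;

  for (i = 0; i < 16; i++) {
    const int rc = zig_zag[i];                    /* raster index of coeff i */
    const int z = b->coeff[rc];
    const int x = abs(z) + b->round[rc];          /* abs(z) + round */
    int y = (x * b->quant_fast[rc]) >> 16;        /* same >> 16 as pmulhw */

    if (z < 0) y = -y;                            /* restore the sign */
    d->qcoeff[rc] = (short)y;
    d->dqcoeff[rc] = (short)(y * d->dequant[rc]); /* low 16 bits, as pmullw */

    if (y) eob = i + 1;                           /* eob = last nonzero + 1 */
  }
  *d->eob = (char)eob;                            /* 0 when the block is empty */
}

The SIMD path reaches the same eob by packing the nonzero flags of the quantized magnitudes, reordering them into scan order with pshufb, taking a byte mask, and applying bsr(mask) + 1 (forced to 0 when the mask is empty).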