diff --git a/.gitignore b/.gitignore index ae616b28c89bf5723f40feb83fc4b7c2b4db0973..4074b0bbf565da62b03fdb604b8cb34aa3628c05 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ /ivfdec.dox /ivfenc /ivfenc.dox +/libvpx.so* +/libvpx.ver /obj_int_extract /postproc /postproc.c @@ -43,12 +45,12 @@ /simple_encoder /simple_encoder.c /simple_encoder.dox +/test_libvpx /twopass_encoder /twopass_encoder.c /twopass_encoder.dox /vp8_api1_migration.dox /vp8_scalable_patterns -/vp8_scalable_patterns.c /vp8_scalable_patterns.dox /vp8_set_maps /vp8_set_maps.c @@ -56,7 +58,14 @@ /vp8cx_set_ref /vp8cx_set_ref.c /vp8cx_set_ref.dox +/vpx.pc /vpx_config.c /vpx_config.h +/vpx_rtcd.h /vpx_version.h +/vpxdec +/vpxenc TAGS +.cproject +.project +.settings diff --git a/.mailmap b/.mailmap index 2e1d4a9f93d1470190315b8bde655375cfaf4acf..ba1279bbbc3acc940aef96111fa6483fbb448113 100644 --- a/.mailmap +++ b/.mailmap @@ -3,3 +3,6 @@ Johann Koenig Tero Rintaluoma Tom Finegan Ralph Giles +Ralph Giles +Alpha Lam +Deb Mukherjee diff --git a/AUTHORS b/AUTHORS index a93df45e6bcc14ffb6910287b97dce89aac933ca..0937d5d1b375ed88b0c8964beafcec2f5c93dac2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -6,10 +6,12 @@ Adrian Grange Alex Converse Alexis Ballier Alok Ahuja +Alpha Lam Andoni Morales Alastruey Andres Mejia Aron Rosenberg Attila Nagy +Deb Mukherjee Fabio Pedretti Frank Galligan Fredrik Söderquist @@ -21,6 +23,7 @@ Henrik Lundin James Berry James Zern Jan Kratochvil +Jeff Faust Jeff Muizelaar Jim Bankoski Johann Koenig @@ -28,9 +31,11 @@ John Koleszar Joshua Bleecher Snyder Justin Clift Justin Lebar +KO Myung-Hun Lou Quillio Luca Barbato Makoto Kato +Marco Paniconi Martin Ettl Michael Kohler Mike Hommey @@ -40,12 +45,15 @@ Patrik Westin Paul Wilkins Pavol Rusnak Philip Jägenstedt +Priit Laes Rafael Ávila de Espíndola +Rafaël Carré Ralph Giles Ronald S. Bultje Scott LaVarnway Stefan Holmer Taekhyun Kim +Takanori MATSUURA Tero Rintaluoma Thijs Vermeir Timothy B. Terriberry diff --git a/CHANGELOG b/CHANGELOG index f560d054453e4d3515d4764d736e5e82c33c4cb8..dcb9f738a680fc7ef83aaa9b1305cdf33989c071 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,143 @@ +2012-05-09 v1.1.0 "Eider" + This release introduces a number of enhancements, mostly focused on + real-time encoding. In addition, it fixes a decoder bug (first introduced + in Duclair), so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + This release introduces a new temporal denoiser, controlled by the + VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not + currently take a strength parameter, so the control is effectively + a boolean: zero (off) or non-zero (on). For compatibility with + existing applications, the values accepted are the same as those + for the spatial denoiser (0-6). The temporal denoiser is enabled + by default, and the older spatial denoiser may be restored by + configuring with --disable-temporal-denoising. The temporal denoiser + is more computationally intensive than the spatial one. + + This release removes support for a legacy, decode-only API that was + supported, but deprecated, at the initial release of libvpx + (v0.9.0). This is not expected to have any impact. If you are + impacted, you can apply a reversion of commit 2bf8fb58 locally, but + please update to the latest libvpx API.
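As a rough sketch of the control described above (not itself part of this diff), an application holding an initialized VP8 encoder context could toggle the temporal denoiser as follows; die_codec() stands in for whatever error handling the application uses, as in the bundled examples:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* Sketch: a non-zero level enables the temporal denoiser, zero
     * disables it. Values 1-6 are accepted for compatibility with the
     * strength settings of the old spatial denoiser. */
    static void set_denoising(vpx_codec_ctx_t *encoder, unsigned int level) {
      if (vpx_codec_control(encoder, VP8E_SET_NOISE_SENSITIVITY, level))
        die_codec(encoder, "Failed to set noise sensitivity");
    }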
+ + - Enhancements: + Adds a motion-compensated temporal denoiser to the encoder, which + gives higher quality than the older spatial denoiser. (See above + for notes on upgrading.) + + In addition, support for new compilers and platforms was added, + including: + improved support for Xcode + Android x86 NDK build + OS/2 support + SunCC support + + Changing resolution with vpx_codec_enc_config_set() is now + supported. Previously, reinitializing the codec was required to + change the input resolution. + + The vpxenc application has initial support for producing multiple + encodes from the same input in one call. Resizing is not yet + supported, but varying other codec parameters is. Use -- to + delineate output streams. Options persist from one stream to the + next. + + Also, the vpxenc application will now use a keyframe interval of + 5 seconds by default. Use the --kf-max-dist option to override. + + - Speed: + Decoder performance improved 2.5% versus Duclair. Encoder speed is + consistent with Duclair for most material. Two-pass encoding of + slideshow-like material will see significant improvements. + + Large realtime encoding speed gains at a small quality expense are + possible by configuring the on-the-fly bitpacking experiment with + --enable-onthefly-bitpacking. The realtime encoder can be up to 13% + faster (ARM) depending on the number of threads and bitrate + settings. This technique sees constant gain over the 5-16 speed + range. For video conferencing (VC) style input the loss seen is up + to 0.2dB. See commit 52cf4dca for further details. + + - Quality: + On the whole, quality is consistent with the Duclair release. Some + tweaks: + + Reduced blockiness in easy sections by applying a penalty to + intra modes. + + Improved quality of static sections (like slideshows) with + two-pass encoding. + + Improved keyframe sizing with multiple temporal layers. + + - Bug Fixes: + Corrected the alt-ref contribution to frame rate for visible updates + to the alt-ref buffer. This affected applications making manual + use of the frame reference flags or temporal layers. + + Additional constraints were added to disable multi-frame quality + enhancement (MFQE) in sections of the frame where there is motion. + (#392) + + Fixed corruption issues when vpx_codec_enc_config_set() was called + with spatial resampling enabled. + + Fixed a decoder error introduced in Duclair where the segmentation + map was not being reinitialized on keyframes. (#378) + + +2012-01-27 v1.0.0 "Duclair" + Our fourth named release, focused on performance and features related to + real-time encoding. It also fixes a decoder crash bug introduced in + v0.9.7, so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI incompatible with prior releases of libvpx, so the + "major" version number has been bumped to 1. You must recompile your + applications against the latest version of the libvpx headers. The + API remains compatible, and this should not require code changes in most + applications. + + - Enhancements: + This release introduces several substantial new features to the encoder, + of particular interest to real-time streaming applications. + + Temporal scalability allows the encoder to produce a stream that can + be decimated to different frame rates, with independent rate targeting + for each substream.
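The temporal scalability configuration lives in the ts_* fields of vpx_codec_enc_cfg_t. As a minimal sketch with illustrative values only (the vp8_scalable_patterns utility built elsewhere in this diff is the complete reference), a two-layer setup might look like:

    /* Base layer plus one enhancement layer; the layer pattern repeats
     * every two frames. cfg is a vpx_codec_enc_cfg_t initialized via
     * vpx_codec_enc_config_default(). */
    cfg.ts_number_layers     = 2;
    cfg.ts_periodicity       = 2;
    cfg.ts_layer_id[0]       = 0;   /* even frames: base layer */
    cfg.ts_layer_id[1]       = 1;   /* odd frames: enhancement layer */
    cfg.ts_rate_decimator[0] = 2;   /* base layer at half the frame rate */
    cfg.ts_rate_decimator[1] = 1;
    cfg.ts_target_bitrate[0] = 150; /* cumulative targets, in kbit/s */
    cfg.ts_target_bitrate[1] = 250;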
+ + Multiframe quality enhancement postprocessing can make visual quality + more consistent in the presence of frames that are of substantially + different quality than the surrounding frames, as in the temporal + scalability case and in some forced keyframe scenarios. + + Multiple-resolution encoding support allows the same content to be + encoded at different resolutions faster than encoding each + separately. + + - Speed: + Optimization targets for this release included the decoder and the + real-time modes of the encoder. Decoder speed on x86 has improved 10.5% + with this release. Encoder improvements followed a curve where speeds 1-3 + improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved + 1.5% to 10.5%. "Best" mode speed is consistent with the + Cayuga release. + + - Quality: + Encoder quality in the single-stream case is consistent with the Cayuga + release. + + - Bug Fixes: + This release fixes an out-of-bounds (OOB) read decoder crash bug present + in v0.9.7 related to the clamping of motion vectors in SPLITMV blocks. + This behavior could be triggered by corrupt input or by starting + decoding from a P-frame. + + 2011-08-15 v0.9.7-p1 "Cayuga" patch 1 This is an incremental bugfix release against Cayuga. All users of that release are strongly encouraged to upgrade. diff --git a/LICENSE b/LICENSE index 7a6f99547d4d6b4c5e6d5321acfaeba313e08de3..1ce44343c4a325e9e1189fd98bb44a9f2f096da9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2010, Google Inc. All rights reserved. +Copyright (c) 2010, The WebM Project authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -12,9 +12,10 @@ met: the documentation and/or other materials provided with the distribution. - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT diff --git a/README b/README index dddc5eae4c7b5aedc778f0e6d0f68d35bafb7063..0475dad791ea51899d97a13c0dfe025f82e23fc4 100644 --- a/README +++ b/README @@ -1,5 +1,5 @@ vpx Multi-Format Codec SDK -README - 19 May 2010 +README - 21 June 2012 Welcome to the WebM VP8 Codec SDK! @@ -15,11 +15,19 @@ COMPILING THE APPLICATIONS/LIBRARIES: * Building the documentation requires PHP[3] and Doxygen[4]. If you do not have these packages, you must pass --disable-install-docs to the configure script. + * Downloading the data for the unit tests requires curl[5] and sha1sum. + sha1sum is provided by GNU coreutils, installed by default on + many *nix platforms, as well as MinGW and Cygwin. If coreutils is not + available, a compatible version of sha1sum can be built from + source[6]. These requirements are optional if not running the unit + tests. [1]: http://www.tortall.net/projects/yasm [2]: http://www.cygwin.com [3]: http://php.net [4]: http://www.doxygen.org + [5]: http://curl.haxx.se + [6]: http://www.microbrew.org/tools/md5sha1sum/ 2. Out-of-tree builds Out-of-tree builds are a supported method of building the application.
For @@ -42,17 +50,13 @@ COMPILING THE APPLICATIONS/LIBRARIES: --help output of the configure script. As of this writing, the list of available targets is: + armv5te-android-gcc armv5te-linux-rvct armv5te-linux-gcc - armv5te-symbian-gcc armv6-darwin-gcc armv6-linux-rvct armv6-linux-gcc - armv6-symbian-gcc - iwmmxt-linux-rvct - iwmmxt-linux-gcc - iwmmxt2-linux-rvct - iwmmxt2-linux-gcc + armv7-android-gcc armv7-linux-rvct armv7-linux-gcc mips32-linux-gcc @@ -98,5 +102,5 @@ COMPILING THE APPLICATIONS/LIBRARIES: SUPPORT This library is an open source project supported by its community. Please - please email webm-users@webmproject.org for help. + please email webm-discuss@webmproject.org for help. diff --git a/configure b/configure index 638d0df1ebff36458f87f75d0c36b4389d3317e5..bd3bf1641c971b291c38ea12cd4277461c7ac105 100755 --- a/configure +++ b/configure @@ -20,27 +20,36 @@ show_help(){ show_help_pre cat << EOF Advanced options: - ${toggle_libs} don't build libraries - ${toggle_examples} don't build examples - ${toggle_unit_tests} build unit tests + ${toggle_libs} libraries + ${toggle_examples} examples + ${toggle_docs} documentation + ${toggle_unit_tests} unit tests --libc=PATH path to alternate libc --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] + --sdk-path=PATH path to root of sdk (iOS, android builds only) ${toggle_fast_unaligned} don't use unaligned accesses, even when supported by hardware [auto] ${toggle_codec_srcs} in/exclude codec library source code ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_md5} support for output of checksum data ${toggle_static_msvcrt} use static MSVCRT (VS builds only) + ${toggle_vp8} VP8 codec support ${toggle_vp9} VP9 codec support ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing + ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support + ${toggle_realtime_only} enable this option while building for real-time encoding + ${toggle_onthefly_bitpacking} enable on-the-fly bitpacking in real-time encoding + ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support ${toggle_static} static library support ${toggle_small} favor smaller size over speed ${toggle_postproc_visualizer} macro block / block level visualizers + ${toggle_multi_res_encoding} enable multiple-resolution encoding + ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -76,19 +85,15 @@ EOF # all_platforms is a list of all supported target platforms. Maintain # alphabetically by architecture, generic-gnu last. 
+all_platforms="${all_platforms} armv5te-android-gcc" all_platforms="${all_platforms} armv5te-linux-rvct" all_platforms="${all_platforms} armv5te-linux-gcc" all_platforms="${all_platforms} armv5te-none-rvct" -all_platforms="${all_platforms} armv5te-symbian-gcc" all_platforms="${all_platforms} armv6-darwin-gcc" all_platforms="${all_platforms} armv6-linux-rvct" all_platforms="${all_platforms} armv6-linux-gcc" all_platforms="${all_platforms} armv6-none-rvct" -all_platforms="${all_platforms} armv6-symbian-gcc" -all_platforms="${all_platforms} iwmmxt-linux-rvct" -all_platforms="${all_platforms} iwmmxt-linux-gcc" -all_platforms="${all_platforms} iwmmxt2-linux-rvct" -all_platforms="${all_platforms} iwmmxt2-linux-gcc" +all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 @@ -105,8 +110,12 @@ all_platforms="${all_platforms} x86-darwin8-gcc" all_platforms="${all_platforms} x86-darwin8-icc" all_platforms="${all_platforms} x86-darwin9-gcc" all_platforms="${all_platforms} x86-darwin9-icc" +all_platforms="${all_platforms} x86-darwin10-gcc" +all_platforms="${all_platforms} x86-darwin11-gcc" +all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" +all_platforms="${all_platforms} x86-os2-gcc" all_platforms="${all_platforms} x86-solaris-gcc" all_platforms="${all_platforms} x86-win32-gcc" all_platforms="${all_platforms} x86-win32-vs7" @@ -115,13 +124,18 @@ all_platforms="${all_platforms} x86-win32-vs9" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" all_platforms="${all_platforms} x86_64-darwin11-gcc" +all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" +all_platforms="${all_platforms} x86_64-win64-gcc" all_platforms="${all_platforms} x86_64-win64-vs8" all_platforms="${all_platforms} x86_64-win64-vs9" all_platforms="${all_platforms} universal-darwin8-gcc" all_platforms="${all_platforms} universal-darwin9-gcc" +all_platforms="${all_platforms} universal-darwin10-gcc" +all_platforms="${all_platforms} universal-darwin11-gcc" +all_platforms="${all_platforms} universal-darwin12-gcc" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured @@ -158,20 +172,29 @@ enable optimizations enable fast_unaligned #allow unaligned accesses, if supported by hw enable md5 enable spatial_resampling +enable multithread enable os_support +enable temporal_denoising [ -d ${source_path}/../include ] && enable alt_tree_layout -for d in vp9; do +for d in vp8 vp9; do [ -d ${source_path}/${d} ] && disable alt_tree_layout; done if ! 
enabled alt_tree_layout; then # development environment +[ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder" [ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder" else # customer environment -[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder" -[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder" +[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder" +[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder" +[ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder" +[ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder" +[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder +[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder +[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder +[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt fi @@ -188,13 +211,12 @@ ARCH_LIST=" ppc64 " ARCH_EXT_LIST=" - armv5te - armv6 - armv7 - iwmmxt - iwmmxt2 + edsp + media + neon mips32 + dspr2 mmx sse @@ -252,6 +274,7 @@ CONFIG_LIST=" dc_recon runtime_cpu_detect postproc + multithread internal_stats ${CODECS} ${CODEC_FAMILIES} @@ -259,12 +282,17 @@ CONFIG_LIST=" decoders static_msvcrt spatial_resampling + realtime_only + onthefly_bitpacking + error_concealment shared static small postproc_visualizer os_support unit_tests + multi_res_encoding + temporal_denoising experimental ${EXPERIMENT_LIST} " @@ -285,6 +313,7 @@ CMDLINE_SELECT=" libs examples + docs libc as fast_unaligned @@ -295,17 +324,23 @@ CMDLINE_SELECT=" dequant_tokens dc_recon postproc + multithread internal_stats ${CODECS} ${CODEC_FAMILIES} static_msvcrt mem_tracker spatial_resampling + realtime_only + onthefly_bitpacking + error_concealment shared static small postproc_visualizer unit_tests + multi_res_encoding + temporal_denoising experimental " @@ -394,6 +429,7 @@ process_targets() { enabled debug_libs && DIST_DIR="${DIST_DIR}-debug" enabled codec_srcs && DIST_DIR="${DIST_DIR}-src" ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost" + ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt" ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs" DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}" case "${tgt_os}" in @@ -448,6 +484,18 @@ EOF } process_detect() { + if enabled shared; then + # Can only build shared libs on a subset of platforms. Doing this check + # here rather than at option parse time because the target auto-detect + # magic happens after the command line has been parsed. + if ! enabled linux; then + if enabled gnu; then + echo "--enable-shared is only supported on ELF; assuming this is OK" + else + die "--enable-shared only supported on ELF for now" + fi + fi + fi if [ -z "$CC" ]; then echo "Bypassing toolchain for environment detection." 
enable external_build @@ -492,11 +540,20 @@ process_toolchain() { case $toolchain in universal-darwin*) local darwin_ver=${tgt_os##darwin} - fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc" - # Intel - fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}" - if [ $darwin_ver -gt 8 ]; then + # Snow Leopard (10.6/darwin10) dropped support for PPC + # Include PPC support for all prior versions + if [ $darwin_ver -lt 10 ]; then + fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc" + fi + + # Tiger (10.4/darwin8) brought support for x86 + if [ $darwin_ver -ge 8 ]; then + fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}" + fi + + # Leopard (10.5/darwin9) brought 64 bit support + if [ $darwin_ver -ge 9 ]; then fat_bin_archs="$fat_bin_archs x86_64-${tgt_os}-${tgt_cc}" fi ;; @@ -512,8 +569,11 @@ process_toolchain() { check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual - check_add_cflags -Wundef check_add_cflags -Wvla + check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wuninitialized + check_add_cflags -Wunused-variable + check_add_cflags -Wunused-but-set-variable enabled extra_warnings || check_add_cflags -Wno-unused-function fi @@ -568,6 +628,21 @@ process_toolchain() { if enabled postproc_visualizer; then enabled postproc || die "postproc_visualizer requires postproc to be enabled" fi + + # Enable unit tests if we have a working C++ compiler + case "$toolchain" in + *-vs*) + soft_enable unit_tests + ;; + *-android-*) + # GTestLog must be modified to use Android logging utilities. + ;; + *) + check_cxx "$@" < ${BUILD_PFX}vpx_config.c +print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */" +cat <> ${BUILD_PFX}vpx_config.c static const char* const cfg = "$CONFIGURE_ARGS"; const char *vpx_codec_build_config(void) {return cfg;} EOF diff --git a/docs.mk b/docs.mk index 0d448b86c5a45b73e362b1d85c23202b1eeddf71..cfe57edd9bb0147865f9a002ad12c0553eb7ef4e 100644 --- a/docs.mk +++ b/docs.mk @@ -21,9 +21,6 @@ CODEC_DOX := mainpage.dox \ usage_dx.dox \ # Other doxy files sourced in Markdown -TXT_DOX-$(CONFIG_VP9) += vp8_api1_migration.dox -vp8_api1_migration.dox.DESC = VP8 API 1.x Migration - TXT_DOX = $(call enabled,TXT_DOX) %.dox: %.txt diff --git a/examples.mk b/examples.mk index 74fb6815652bfa4c77fcb003311c067fb4f1dbe3..0d4b4d5a92f1f4f52b2a347256db247bb467c610 100644 --- a/examples.mk +++ b/examples.mk @@ -16,7 +16,7 @@ UTILS-$(CONFIG_DECODERS) += vpxdec.c vpxdec.SRCS += md5_utils.c md5_utils.h vpxdec.SRCS += vpx_ports/vpx_timer.h vpxdec.SRCS += vpx/vpx_integer.h -vpxdec.SRCS += args.c args.h vpx_ports/config.h +vpxdec.SRCS += args.c args.h vpxdec.SRCS += tools_common.c tools_common.h vpxdec.SRCS += nestegg/halloc/halloc.h vpxdec.SRCS += nestegg/halloc/src/align.h @@ -30,13 +30,17 @@ vpxdec.DESCRIPTION = Full featured decoder UTILS-$(CONFIG_ENCODERS) += vpxenc.c vpxenc.SRCS += args.c args.h y4minput.c y4minput.h vpxenc.SRCS += tools_common.c tools_common.h -vpxenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h +vpxenc.SRCS += vpx_ports/mem_ops.h vpxenc.SRCS += vpx_ports/mem_ops_aligned.h +vpxenc.SRCS += vpx_ports/vpx_timer.h vpxenc.SRCS += libmkv/EbmlIDs.h vpxenc.SRCS += libmkv/EbmlWriter.c vpxenc.SRCS += libmkv/EbmlWriter.h vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1 vpxenc.DESCRIPTION = Full featured encoder +UTILS-$(CONFIG_ENCODERS) += vp8_scalable_patterns.c +vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C +vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder # Clean up old ivfenc, ivfdec binaries. 
ifeq ($(CONFIG_MSVS),yes) @@ -77,29 +81,44 @@ GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_with_drops.c endif decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26 decode_with_drops.DESCRIPTION = Drops frames while decoding +ifeq ($(CONFIG_DECODERS),yes) +GEN_EXAMPLES-$(CONFIG_ERROR_CONCEALMENT) += decode_with_partial_drops.c +endif +decode_with_partial_drops.GUID = 61C2D026-5754-46AC-916F-1343ECC5537E +decode_with_partial_drops.DESCRIPTION = Drops parts of frames while decoding GEN_EXAMPLES-$(CONFIG_ENCODERS) += error_resilient.c error_resilient.GUID = DF5837B9-4145-4F92-A031-44E4F832E00C error_resilient.DESCRIPTION = Error Resiliency Feature -GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_scalable_patterns.c -vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C -vp8_scalable_patterns.DESCRIPTION = VP8 Scalable Bitstream Patterns -GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_set_maps.c +GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_set_maps.c vp8_set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F vp8_set_maps.DESCRIPTION = VP8 set active and ROI maps -GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8cx_set_ref.c +GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame +# C file is provided, not generated automatically. +UTILS-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS \ + += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/cpu_id.h \ + third_party/libyuv/include/libyuv/scale.h \ + third_party/libyuv/source/row.h \ + third_party/libyuv/source/scale.c \ + third_party/libyuv/source/cpu_id.c +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding # Handle extra library flags depending on codec configuration # We should not link to math library (libm) on RVCT # when building for bare-metal targets ifeq ($(CONFIG_OS_SUPPORT), yes) +CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m else ifeq ($(CONFIG_GCC), yes) + CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m endif endif @@ -117,6 +136,8 @@ ifeq ($(HAVE_ALT_TREE_LAYOUT),yes) INC_PATH := $(SRC_PATH_BARE)/../include else LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.) + INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8 + INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8 INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9 INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9 LIB_PATH := $(call enabled,LIB_PATH) @@ -152,12 +173,12 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes) # Create build/install dependencies for all examples. The common case # is handled here. The MSVS case is handled below. NOT_MSVS = $(if $(CONFIG_MSVS),,yes) -DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=)) -INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=)) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX))) +INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX))) DIST-SRCS-yes += $(ALL_SRCS) INSTALL-SRCS-yes += $(UTIL_SRCS) OBJS-$(NOT_MSVS) += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS))) -BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX))) # Instantiate linker template for all examples. 
@@ -168,7 +189,7 @@ $(foreach bin,$(BINS-yes),\ $(if $(BUILD_OBJS),$(eval $(bin):\ $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\ $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\ - $(call objs,$($(notdir $(bin)).SRCS)) \ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\ )))\ $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\ diff --git a/examples/decoder_tmpl.txt b/examples/decoder_tmpl.txt index 90a9b2cc951be2c13018d003acce76b868daad72..3d230a5ae1f064fcbb356ef76591083e7cdbcf0a 100644 --- a/examples/decoder_tmpl.txt +++ b/examples/decoder_tmpl.txt @@ -48,8 +48,8 @@ for(plane=0; plane < 3; plane++) { unsigned char *buf =img->planes[plane]; for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) { - if(fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w), - outfile)); + (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w), + outfile); buf += img->stride[plane]; } } diff --git a/examples/encoder_tmpl.c b/examples/encoder_tmpl.c index cc70b0081056c1e1b6856a5b0eefe43e3cb61168..e2b65ecb60ca84f6cdb955c7bd0206c7a2469570 100644 --- a/examples/encoder_tmpl.c +++ b/examples/encoder_tmpl.c @@ -85,7 +85,7 @@ static void write_ivf_file_header(FILE *outfile, mem_put_le32(header+24, frame_cnt); /* length */ mem_put_le32(header+28, 0); /* unused */ - if(fwrite(header, 1, 32, outfile)); + (void) fwrite(header, 1, 32, outfile); } @@ -103,7 +103,7 @@ static void write_ivf_frame_header(FILE *outfile, mem_put_le32(header+4, pts&0xFFFFFFFF); mem_put_le32(header+8, pts >> 32); - if(fwrite(header, 1, 12, outfile)); + (void) fwrite(header, 1, 12, outfile); } int main(int argc, char **argv) { diff --git a/examples/encoder_tmpl.txt b/examples/encoder_tmpl.txt index afc1193675bee374baa3ad571800db1478b05d11..9f8f4afd57bd7dd3be2fc6ebf2b8f6a7771c8a9e 100644 --- a/examples/encoder_tmpl.txt +++ b/examples/encoder_tmpl.txt @@ -61,13 +61,14 @@ if(vpx_codec_encode(&codec, frame_avail? 
&raw : NULL, frame_cnt, ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME case VPX_CODEC_CX_FRAME_PKT: write_ivf_frame_header(outfile, pkt); - if(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, - outfile)); + (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, + outfile); break; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY +vpx_img_free(&raw); if(vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY diff --git a/examples/postproc.txt b/examples/postproc.txt index 40f75f2c572e46d9ebe46e8b128918467464be33..e00bf59c01597cfae15563b9a07052ad5aec150e 100644 --- a/examples/postproc.txt +++ b/examples/postproc.txt @@ -58,7 +58,7 @@ if(frame_cnt%30 == 1) { if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) die_codec(&codec, "Failed to turn off postproc"); } else if(frame_cnt%30 == 16) { - vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0}; + vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0}; if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) die_codec(&codec, "Failed to turn on postproc"); diff --git a/examples/twopass_encoder.txt b/examples/twopass_encoder.txt index 4683bc708a6c5ca917d0b6a056410d46ad6729e9..2f81a9018c97c295eee2af2f9cc91b359b555e13 100644 --- a/examples/twopass_encoder.txt +++ b/examples/twopass_encoder.txt @@ -71,5 +71,17 @@ Pass Progress Reporting It's sometimes helpful to see when each pass completes. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END printf("Pass %d complete.\n", pass+1); + if(vpx_codec_destroy(&codec)) + die_codec(&codec, "Failed to destroy codec"); } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END + + +Clean-up +----------------------------- +Destruction of the encoder instance must be done on each pass. The +raw image should be destroyed at the end as usual. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY +vpx_img_free(&raw); +free(stats.buf); +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY diff --git a/examples/vp8_scalable_patterns.txt b/examples/vp8_scalable_patterns.txt deleted file mode 100644 index e1d5dbdaadf26eca6032b045eed2d6c8c2a94765..0000000000000000000000000000000000000000 --- a/examples/vp8_scalable_patterns.txt +++ /dev/null @@ -1,143 +0,0 @@ -@TEMPLATE encoder_tmpl.c -VP8 Scalable Frame Patterns -=========================== -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION -This is an example demonstrating how to control the VP8 encoder's -reference frame selection and update mechanism for video applications -that benefit from a scalable bitstream. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION - - -Configuration -------------- -Scalable frame patterns are most useful in an error resilient context, -so error resiliency mode is enabled, as in the `error_resilient.c` -example. In addition, we want to disable automatic keyframe selection, -so we force an interval of 1000 frames. 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_SET_CFG2 - -/* Enable error resilient mode */ -cfg.g_error_resilient = 1; -cfg.g_lag_in_frames = 0; -cfg.kf_mode = VPX_KF_FIXED; - -/* Disable automatic keyframe placement */ -cfg.kf_min_dist = cfg.kf_max_dist = 1000; -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_SET_CFG2 - -This example uses the following frame pattern (L->last_frame, -G->golden_frame, A->alt_ref_frame): - -* Frame 0 Intra, use none, update L&G&A -* Frame 1 Inter, use LGA, update none -* Frame 2 Inter, use LGA, update L -* Frame 3 Inter, use LGA, update none -* Frame 4 Inter, use GA, update L&G -* Frame 5 Inter, use LGA, update none -* Frame 6 Inter, use LGA, update L -* Frame 7 Inter, use LGA, update none -* Frame 8 Inter, use A, update L&G&A -* Frame 9 Inter, use LGA, update none -* Frame 10 Inter, use LGA, update L -* Frame 11 Inter, use LGA, update none -* Frame 12 Inter, use GA, update L&G -* Frame 13 Inter, use LGA, update none -* Frame 14 Inter, use LGA, update L -* Frame 15 Inter, use LGA, update none -* ...Repeats the pattern from frame 0 - -Change this variable to test the 3 decodable streams case. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_VARS -int num_streams = 5; -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_VARS - - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PER_FRAME_CFG -flags = 0; -if(num_streams == 5) -{ - switch(frame_cnt % 16) { - case 0: - flags |= VPX_EFLAG_FORCE_KF; - flags |= VP8_EFLAG_FORCE_GF; - flags |= VP8_EFLAG_FORCE_ARF; - break; - case 1: - case 3: - case 5: - case 7: - case 9: - case 11: - case 13: - case 15: - flags |= VP8_EFLAG_NO_UPD_LAST; - flags |= VP8_EFLAG_NO_UPD_GF; - flags |= VP8_EFLAG_NO_UPD_ARF; - break; - case 2: - case 6: - case 10: - case 14: - break; - case 4: - flags |= VP8_EFLAG_NO_REF_LAST; - flags |= VP8_EFLAG_FORCE_GF; - break; - case 8: - flags |= VP8_EFLAG_NO_REF_LAST; - flags |= VP8_EFLAG_NO_REF_GF; - flags |= VP8_EFLAG_FORCE_GF; - flags |= VP8_EFLAG_FORCE_ARF; - break; - case 12: - flags |= VP8_EFLAG_NO_REF_LAST; - flags |= VP8_EFLAG_FORCE_GF; - break; - } -} -else -{ - switch(frame_cnt % 9) { - case 0: - if(frame_cnt==0) - { - flags |= VPX_EFLAG_FORCE_KF; - } - else - { - cfg.rc_max_quantizer = 26; - cfg.rc_min_quantizer = 0; - cfg.rc_target_bitrate = 300; - flags |= VP8_EFLAG_NO_REF_LAST; - flags |= VP8_EFLAG_NO_REF_ARF; - } - flags |= VP8_EFLAG_FORCE_GF; - flags |= VP8_EFLAG_FORCE_ARF; - break; - case 1: - case 2: - case 4: - case 5: - case 7: - case 8: - cfg.rc_max_quantizer = 45; - cfg.rc_min_quantizer = 0; - cfg.rc_target_bitrate = 230; - break; - case 3: - case 6: - cfg.rc_max_quantizer = 45; - cfg.rc_min_quantizer = 0; - cfg.rc_target_bitrate = 215; - flags |= VP8_EFLAG_NO_REF_LAST; - flags |= VP8_EFLAG_FORCE_ARF; - break; - } -} -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PER_FRAME_CFG - -Observing The Effects ---------------------- -Use the `decode_with_drops` example to decode with various dropped frame -patterns. Good patterns to start with are 1/2, 3/4, 7/8, and 15/16 -drops. diff --git a/libmkv/EbmlIDs.h b/libmkv/EbmlIDs.h index c6c4a696005149ab9ebd4950a9c1bce1250e3e9f..4920bf9baed91bfc3203266c5fe64c2175d85bac 100644 --- a/libmkv/EbmlIDs.h +++ b/libmkv/EbmlIDs.h @@ -1,16 +1,16 @@ -// Copyright (c) 2010 The WebM project authors. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the LICENSE file in the root of the source -// tree. 
An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. - - +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #ifndef MKV_DEFS_HPP #define MKV_DEFS_HPP 1 -// Commenting out values not available in webm, but available in matroska +/* Commenting out values not available in webm, but available in matroska */ enum mkv { EBML = 0x1A45DFA3, @@ -21,7 +21,7 @@ enum mkv { DocType = 0x4282, DocTypeVersion = 0x4287, DocTypeReadVersion = 0x4285, -// CRC_32 = 0xBF, +/* CRC_32 = 0xBF, */ Void = 0xEC, SignatureSlot = 0x1B538667, SignatureAlgo = 0x7E8A, @@ -31,61 +31,61 @@ enum mkv { SignatureElements = 0x7E5B, SignatureElementList = 0x7E7B, SignedElement = 0x6532, - // segment + /* segment */ Segment = 0x18538067, - // Meta Seek Information + /* Meta Seek Information */ SeekHead = 0x114D9B74, Seek = 0x4DBB, SeekID = 0x53AB, SeekPosition = 0x53AC, - // Segment Information + /* Segment Information */ Info = 0x1549A966, -// SegmentUID = 0x73A4, -// SegmentFilename = 0x7384, -// PrevUID = 0x3CB923, -// PrevFilename = 0x3C83AB, -// NextUID = 0x3EB923, -// NextFilename = 0x3E83BB, -// SegmentFamily = 0x4444, -// ChapterTranslate = 0x6924, -// ChapterTranslateEditionUID = 0x69FC, -// ChapterTranslateCodec = 0x69BF, -// ChapterTranslateID = 0x69A5, +/* SegmentUID = 0x73A4, */ +/* SegmentFilename = 0x7384, */ +/* PrevUID = 0x3CB923, */ +/* PrevFilename = 0x3C83AB, */ +/* NextUID = 0x3EB923, */ +/* NextFilename = 0x3E83BB, */ +/* SegmentFamily = 0x4444, */ +/* ChapterTranslate = 0x6924, */ +/* ChapterTranslateEditionUID = 0x69FC, */ +/* ChapterTranslateCodec = 0x69BF, */ +/* ChapterTranslateID = 0x69A5, */ TimecodeScale = 0x2AD7B1, Segment_Duration = 0x4489, DateUTC = 0x4461, -// Title = 0x7BA9, +/* Title = 0x7BA9, */ MuxingApp = 0x4D80, WritingApp = 0x5741, - // Cluster + /* Cluster */ Cluster = 0x1F43B675, Timecode = 0xE7, -// SilentTracks = 0x5854, -// SilentTrackNumber = 0x58D7, -// Position = 0xA7, +/* SilentTracks = 0x5854, */ +/* SilentTrackNumber = 0x58D7, */ +/* Position = 0xA7, */ PrevSize = 0xAB, BlockGroup = 0xA0, Block = 0xA1, -// BlockVirtual = 0xA2, -// BlockAdditions = 0x75A1, -// BlockMore = 0xA6, -// BlockAddID = 0xEE, -// BlockAdditional = 0xA5, +/* BlockVirtual = 0xA2, */ +/* BlockAdditions = 0x75A1, */ +/* BlockMore = 0xA6, */ +/* BlockAddID = 0xEE, */ +/* BlockAdditional = 0xA5, */ BlockDuration = 0x9B, -// ReferencePriority = 0xFA, +/* ReferencePriority = 0xFA, */ ReferenceBlock = 0xFB, -// ReferenceVirtual = 0xFD, -// CodecState = 0xA4, -// Slices = 0x8E, -// TimeSlice = 0xE8, +/* ReferenceVirtual = 0xFD, */ +/* CodecState = 0xA4, */ +/* Slices = 0x8E, */ +/* TimeSlice = 0xE8, */ LaceNumber = 0xCC, -// FrameNumber = 0xCD, -// BlockAdditionID = 0xCB, -// MkvDelay = 0xCE, -// Cluster_Duration = 0xCF, +/* FrameNumber = 0xCD, */ +/* BlockAdditionID = 0xCB, */ +/* MkvDelay = 0xCE, */ +/* Cluster_Duration = 0xCF, */ SimpleBlock = 0xA3, -// EncryptedBlock = 0xAF, - // Track +/* EncryptedBlock = 0xAF, */ + /* Track */ Tracks = 0x1654AE6B, TrackEntry = 0xAE, TrackNumber = 0xD7, @@ -95,28 +95,28 @@ enum mkv { 
FlagDefault = 0x88, FlagForced = 0x55AA, FlagLacing = 0x9C, -// MinCache = 0x6DE7, -// MaxCache = 0x6DF8, +/* MinCache = 0x6DE7, */ +/* MaxCache = 0x6DF8, */ DefaultDuration = 0x23E383, -// TrackTimecodeScale = 0x23314F, -// TrackOffset = 0x537F, -// MaxBlockAdditionID = 0x55EE, +/* TrackTimecodeScale = 0x23314F, */ +/* TrackOffset = 0x537F, */ +/* MaxBlockAdditionID = 0x55EE, */ Name = 0x536E, Language = 0x22B59C, CodecID = 0x86, CodecPrivate = 0x63A2, CodecName = 0x258688, -// AttachmentLink = 0x7446, -// CodecSettings = 0x3A9697, -// CodecInfoURL = 0x3B4040, -// CodecDownloadURL = 0x26B240, -// CodecDecodeAll = 0xAA, -// TrackOverlay = 0x6FAB, -// TrackTranslate = 0x6624, -// TrackTranslateEditionUID = 0x66FC, -// TrackTranslateCodec = 0x66BF, -// TrackTranslateTrackID = 0x66A5, - // video +/* AttachmentLink = 0x7446, */ +/* CodecSettings = 0x3A9697, */ +/* CodecInfoURL = 0x3B4040, */ +/* CodecDownloadURL = 0x26B240, */ +/* CodecDecodeAll = 0xAA, */ +/* TrackOverlay = 0x6FAB, */ +/* TrackTranslate = 0x6624, */ +/* TrackTranslateEditionUID = 0x66FC, */ +/* TrackTranslateCodec = 0x66BF, */ +/* TrackTranslateTrackID = 0x66A5, */ + /* video */ Video = 0xE0, FlagInterlaced = 0x9A, StereoMode = 0x53B8, @@ -130,101 +130,101 @@ enum mkv { DisplayHeight = 0x54BA, DisplayUnit = 0x54B2, AspectRatioType = 0x54B3, -// ColourSpace = 0x2EB524, -// GammaValue = 0x2FB523, +/* ColourSpace = 0x2EB524, */ +/* GammaValue = 0x2FB523, */ FrameRate = 0x2383E3, - // end video - // audio + /* end video */ + /* audio */ Audio = 0xE1, SamplingFrequency = 0xB5, OutputSamplingFrequency = 0x78B5, Channels = 0x9F, -// ChannelPositions = 0x7D7B, +/* ChannelPositions = 0x7D7B, */ BitDepth = 0x6264, - // end audio - // content encoding -// ContentEncodings = 0x6d80, -// ContentEncoding = 0x6240, -// ContentEncodingOrder = 0x5031, -// ContentEncodingScope = 0x5032, -// ContentEncodingType = 0x5033, -// ContentCompression = 0x5034, -// ContentCompAlgo = 0x4254, -// ContentCompSettings = 0x4255, -// ContentEncryption = 0x5035, -// ContentEncAlgo = 0x47e1, -// ContentEncKeyID = 0x47e2, -// ContentSignature = 0x47e3, -// ContentSigKeyID = 0x47e4, -// ContentSigAlgo = 0x47e5, -// ContentSigHashAlgo = 0x47e6, - // end content encoding - // Cueing Data + /* end audio */ + /* content encoding */ +/* ContentEncodings = 0x6d80, */ +/* ContentEncoding = 0x6240, */ +/* ContentEncodingOrder = 0x5031, */ +/* ContentEncodingScope = 0x5032, */ +/* ContentEncodingType = 0x5033, */ +/* ContentCompression = 0x5034, */ +/* ContentCompAlgo = 0x4254, */ +/* ContentCompSettings = 0x4255, */ +/* ContentEncryption = 0x5035, */ +/* ContentEncAlgo = 0x47e1, */ +/* ContentEncKeyID = 0x47e2, */ +/* ContentSignature = 0x47e3, */ +/* ContentSigKeyID = 0x47e4, */ +/* ContentSigAlgo = 0x47e5, */ +/* ContentSigHashAlgo = 0x47e6, */ + /* end content encoding */ + /* Cueing Data */ Cues = 0x1C53BB6B, CuePoint = 0xBB, CueTime = 0xB3, CueTrackPositions = 0xB7, CueTrack = 0xF7, CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378, -// CueCodecState = 0xEA, -// CueReference = 0xDB, -// CueRefTime = 0x96, -// CueRefCluster = 0x97, -// CueRefNumber = 0x535F, -// CueRefCodecState = 0xEB, - // Attachment -// Attachments = 0x1941A469, -// AttachedFile = 0x61A7, -// FileDescription = 0x467E, -// FileName = 0x466E, -// FileMimeType = 0x4660, -// FileData = 0x465C, -// FileUID = 0x46AE, -// FileReferral = 0x4675, - // Chapters -// Chapters = 0x1043A770, -// EditionEntry = 0x45B9, -// EditionUID = 0x45BC, -// EditionFlagHidden = 0x45BD, -// EditionFlagDefault = 0x45DB, 
-// EditionFlagOrdered = 0x45DD, -// ChapterAtom = 0xB6, -// ChapterUID = 0x73C4, -// ChapterTimeStart = 0x91, -// ChapterTimeEnd = 0x92, -// ChapterFlagHidden = 0x98, -// ChapterFlagEnabled = 0x4598, -// ChapterSegmentUID = 0x6E67, -// ChapterSegmentEditionUID = 0x6EBC, -// ChapterPhysicalEquiv = 0x63C3, -// ChapterTrack = 0x8F, -// ChapterTrackNumber = 0x89, -// ChapterDisplay = 0x80, -// ChapString = 0x85, -// ChapLanguage = 0x437C, -// ChapCountry = 0x437E, -// ChapProcess = 0x6944, -// ChapProcessCodecID = 0x6955, -// ChapProcessPrivate = 0x450D, -// ChapProcessCommand = 0x6911, -// ChapProcessTime = 0x6922, -// ChapProcessData = 0x6933, - // Tagging -// Tags = 0x1254C367, -// Tag = 0x7373, -// Targets = 0x63C0, -// TargetTypeValue = 0x68CA, -// TargetType = 0x63CA, -// Tagging_TrackUID = 0x63C5, -// Tagging_EditionUID = 0x63C9, -// Tagging_ChapterUID = 0x63C4, -// AttachmentUID = 0x63C6, -// SimpleTag = 0x67C8, -// TagName = 0x45A3, -// TagLanguage = 0x447A, -// TagDefault = 0x4484, -// TagString = 0x4487, -// TagBinary = 0x4485, + CueBlockNumber = 0x5378 +/* CueCodecState = 0xEA, */ +/* CueReference = 0xDB, */ +/* CueRefTime = 0x96, */ +/* CueRefCluster = 0x97, */ +/* CueRefNumber = 0x535F, */ +/* CueRefCodecState = 0xEB, */ + /* Attachment */ +/* Attachments = 0x1941A469, */ +/* AttachedFile = 0x61A7, */ +/* FileDescription = 0x467E, */ +/* FileName = 0x466E, */ +/* FileMimeType = 0x4660, */ +/* FileData = 0x465C, */ +/* FileUID = 0x46AE, */ +/* FileReferral = 0x4675, */ + /* Chapters */ +/* Chapters = 0x1043A770, */ +/* EditionEntry = 0x45B9, */ +/* EditionUID = 0x45BC, */ +/* EditionFlagHidden = 0x45BD, */ +/* EditionFlagDefault = 0x45DB, */ +/* EditionFlagOrdered = 0x45DD, */ +/* ChapterAtom = 0xB6, */ +/* ChapterUID = 0x73C4, */ +/* ChapterTimeStart = 0x91, */ +/* ChapterTimeEnd = 0x92, */ +/* ChapterFlagHidden = 0x98, */ +/* ChapterFlagEnabled = 0x4598, */ +/* ChapterSegmentUID = 0x6E67, */ +/* ChapterSegmentEditionUID = 0x6EBC, */ +/* ChapterPhysicalEquiv = 0x63C3, */ +/* ChapterTrack = 0x8F, */ +/* ChapterTrackNumber = 0x89, */ +/* ChapterDisplay = 0x80, */ +/* ChapString = 0x85, */ +/* ChapLanguage = 0x437C, */ +/* ChapCountry = 0x437E, */ +/* ChapProcess = 0x6944, */ +/* ChapProcessCodecID = 0x6955, */ +/* ChapProcessPrivate = 0x450D, */ +/* ChapProcessCommand = 0x6911, */ +/* ChapProcessTime = 0x6922, */ +/* ChapProcessData = 0x6933, */ + /* Tagging */ +/* Tags = 0x1254C367, */ +/* Tag = 0x7373, */ +/* Targets = 0x63C0, */ +/* TargetTypeValue = 0x68CA, */ +/* TargetType = 0x63CA, */ +/* Tagging_TrackUID = 0x63C5, */ +/* Tagging_EditionUID = 0x63C9, */ +/* Tagging_ChapterUID = 0x63C4, */ +/* AttachmentUID = 0x63C6, */ +/* SimpleTag = 0x67C8, */ +/* TagName = 0x45A3, */ +/* TagLanguage = 0x447A, */ +/* TagDefault = 0x4484, */ +/* TagString = 0x4487, */ +/* TagBinary = 0x4485, */ }; #endif diff --git a/libmkv/EbmlWriter.c b/libmkv/EbmlWriter.c index 69039e1bf78905807880e9daed7c1e86170d1f84..5fc5ed2a3a6a6d62f5020b49b2be51a325a1c8f6 100644 --- a/libmkv/EbmlWriter.c +++ b/libmkv/EbmlWriter.c @@ -1,12 +1,12 @@ -// Copyright (c) 2010 The WebM project authors. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the LICENSE file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. - - +/* + * Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #include "EbmlWriter.h" #include #include @@ -18,10 +18,12 @@ #define LITERALU64(n) n##LLU #endif -void Ebml_WriteLen(EbmlGlobal *glob, long long val) { - // TODO check and make sure we are not > than 0x0100000000000000LLU - unsigned char size = 8; // size in bytes to output - unsigned long long minVal = LITERALU64(0x00000000000000ff); // mask to compare for byte size +void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) { + /* TODO check and make sure we are not > than 0x0100000000000000LLU */ + unsigned char size = 8; /* size in bytes to output */ + + /* mask to compare for byte size */ + int64_t minVal = 0xff; for (size = 1; size < 8; size ++) { if (val < minVal) @@ -30,29 +32,31 @@ void Ebml_WriteLen(EbmlGlobal *glob, long long val) { minVal = (minVal << 7); } - val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7)); + val |= (((uint64_t)0x80) << ((size - 1) * 7)); Ebml_Serialize(glob, (void *) &val, sizeof(val), size); } void Ebml_WriteString(EbmlGlobal *glob, const char *str) { const size_t size_ = strlen(str); - const unsigned long long size = size_; + const uint64_t size = size_; Ebml_WriteLen(glob, size); - // TODO: it's not clear from the spec whether the nul terminator - // should be serialized too. For now we omit the null terminator. - Ebml_Write(glob, str, size); + /* TODO: it's not clear from the spec whether the nul terminator + * should be serialized too. For now we omit the null terminator. + */ + Ebml_Write(glob, str, (unsigned long)size); } void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) { const size_t strlen = wcslen(wstr); - // TODO: it's not clear from the spec whether the nul terminator - // should be serialized too. For now we include it. - const unsigned long long size = strlen; + /* TODO: it's not clear from the spec whether the nul terminator + * should be serialized too. For now we include it. 
+ */ + const uint64_t size = strlen; Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, size); + Ebml_Write(glob, wstr, (unsigned long)size); } void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) { @@ -78,12 +82,12 @@ void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t } void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) { - unsigned char size = 8; // size in bytes to output + unsigned char size = 8; /* size in bytes to output */ unsigned char sizeSerialized = 0; unsigned long minVal; Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; // mask to compare for byte size + minVal = 0x7fLU; /* mask to compare for byte size */ for (size = 1; size < 4; size ++) { if (ui < minVal) { @@ -97,7 +101,7 @@ void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned l Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); Ebml_Serialize(glob, &ui, sizeof(ui), size); } -// TODO: perhaps this is a poor name for this id serializer helper function +/* TODO: perhaps this is a poor name for this id serializer helper function */ void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) { int size; for (size = 4; size > 1; size--) { @@ -150,4 +154,4 @@ void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) { } } -// TODO Serialize Date +/* TODO Serialize Date */ diff --git a/libmkv/EbmlWriter.h b/libmkv/EbmlWriter.h index c4822ee1c0a2ee3326a48371eb1fc62082111977..b94f757332f0acd2900038b8835ca00e62f47fda 100644 --- a/libmkv/EbmlWriter.h +++ b/libmkv/EbmlWriter.h @@ -1,26 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #ifndef EBMLWRITER_HPP #define EBMLWRITER_HPP - -// Copyright (c) 2010 The WebM project authors. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the LICENSE file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. 
- -// note: you must define write and serialize functions as well as your own EBML_GLOBAL -// These functions MUST be implemented #include #include "vpx/vpx_integer.h" +/* note: you must define write and serialize functions as well as your own + * EBML_GLOBAL + * + * These functions MUST be implemented + */ + typedef struct EbmlGlobal EbmlGlobal; void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long); void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long); -///// +/*****/ -void Ebml_WriteLen(EbmlGlobal *glob, long long val); +void Ebml_WriteLen(EbmlGlobal *glob, int64_t val); void Ebml_WriteString(EbmlGlobal *glob, const char *str); void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr); void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id); @@ -28,11 +32,11 @@ void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d); -// TODO make this more generic to signed +/* TODO make this more generic to signed */ void Ebml_WriteSigned16(EbmlGlobal *glob, short val); void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s); void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s); void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length); void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize); -// TODO need date function +/* TODO need date function */ #endif diff --git a/libs.mk b/libs.mk index abb7a8e3df144e3875871cc76fd19425bdef6b8b..9af6a35c75a9cdd82283e2d99623336ffcf2f8e8 100644 --- a/libs.mk +++ b/libs.mk @@ -17,6 +17,34 @@ else ASM:=.asm endif + +# +# Calculate platform- and compiler-specific offsets for hand coded assembly +# +ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC)) +OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU' +define asm_offsets_template +$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S + @echo " [CREATE] $$@" + $$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@ +$$(BUILD_PFX)$(2).S: $(2) +CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S +endef +else + ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC)) +define asm_offsets_template +$$(BUILD_PFX)$(1): obj_int_extract +$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o + @echo " [CREATE] $$@" + $$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@ +OBJS-yes += $$(BUILD_PFX)$(2).o +CLEAN-OBJS += $$(BUILD_PFX)$(1) +$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1) +endef +endif # rvct +endif # !gcc + + CODEC_SRCS-yes += CHANGELOG CODEC_SRCS-yes += libs.mk @@ -29,15 +57,47 @@ CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS)) include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS)) +ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),) + VP8_PREFIX=vp8/ + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk +endif + +ifeq ($(CONFIG_VP8_ENCODER),yes) + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS)) + CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h + CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp88cx_arm.mk + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL_MAPS += include/vpx/% 
$(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h + CODEC_DOC_SECTIONS += vp8 vp8_encoder +endif + +ifeq ($(CONFIG_VP8_DECODER),yes) + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS)) + CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h + CODEC_DOC_SECTIONS += vp8 vp8_decoder +endif + +ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk +endif ifeq ($(CONFIG_VP9_ENCODER),yes) VP9_PREFIX=vp9/ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) - CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h + CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk - INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h CODEC_DOC_SECTIONS += vp9 vp9_encoder @@ -117,7 +177,6 @@ INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_top.h INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_bottom.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h -INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder_compat.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -149,7 +208,7 @@ ifeq ($(CONFIG_MSVS),yes) obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c @cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat . 
@echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ --exe \ --target=$(TOOLCHAIN) \ --name=obj_int_extract \ @@ -165,14 +224,14 @@ PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat vpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\ --name=vpx\ --out=$@ $^ CLEAN-OBJS += vpx.def vpx.vcproj: $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ $(if $(CONFIG_SHARED),--dll,--lib) \ --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ @@ -264,6 +323,7 @@ vpx.pc: config.mk libs.mk $(qexec)echo 'Requires:' >> $@ $(qexec)echo 'Conflicts:' >> $@ $(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@ + $(qexec)echo 'Libs.private: -lm -lpthread' >> $@ $(qexec)echo 'Cflags: -I$${includedir}' >> $@ INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc @@ -298,57 +358,6 @@ endif $(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm -# -# Calculate platform- and compiler-specific offsets for hand coded assembly -# - -OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU' - -ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC)) - $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S - @echo " [CREATE] $@" - $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S - - $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S - @echo " [CREATE] $@" - $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S - - $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S - @echo " [CREATE] $@" - $(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@ - $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c - CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S -else - ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC)) - asm_com_offsets.asm: obj_int_extract - asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o - @echo " [CREATE] $@" - $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o - CLEAN-OBJS += asm_com_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm - - asm_enc_offsets.asm: obj_int_extract - asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o - @echo " [CREATE] $@" - $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o - CLEAN-OBJS += asm_enc_offsets.asm - $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm - - asm_dec_offsets.asm: obj_int_extract - asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o - @echo " [CREATE] $@" - $(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@ - OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o - CLEAN-OBJS += asm_dec_offsets.asm - $(filter 
%$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm - endif -endif $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h) CLEAN-OBJS += $(BUILD_PFX)vpx_version.h @@ -356,15 +365,15 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_version.h # # Rule to generate runtime cpu detection files # -$(OBJS-yes:.o=.d): vpx_rtcd.h -vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS))) +$(BUILD_PFX)vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS))) @echo " [CREATE] $@" $(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \ - --sym=vpx_rtcd \ - --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \ - $(RTCD_OPTIONS) $^ > $@ + --sym=vpx_rtcd \ + --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \ + $(RTCD_OPTIONS) $^ > $@ CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h + CODEC_DOC_SRCS += vpx/vpx_codec.h \ vpx/vpx_decoder.h \ vpx/vpx_encoder.h \ @@ -373,7 +382,6 @@ CODEC_DOC_SRCS += vpx/vpx_codec.h \ ## ## libvpx test directives ## - ifeq ($(CONFIG_UNIT_TESTS),yes) LIBVPX_TEST_DATA_PATH ?= . @@ -392,8 +400,12 @@ $(LIBVPX_TEST_DATA): testdata:: $(LIBVPX_TEST_DATA) $(qexec)if [ -x "$$(which sha1sum)" ]; then\ echo "Checking test data:";\ - (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c)\ - < $(SRC_PATH_BARE)/test/test-data.sha1; \ + if [ -n "$(LIBVPX_TEST_DATA)" ]; then\ + for f in $(call enabled,LIBVPX_TEST_DATA); do\ + grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ + (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\ + done; \ + fi; \ else\ echo "Skipping test data integrity check, sha1sum not found.";\ fi @@ -403,7 +415,7 @@ ifeq ($(CONFIG_MSVS),yes) gtest.vcproj: $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ --lib \ --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ @@ -418,7 +430,7 @@ PROJECTS-$(CONFIG_MSVS) += gtest.vcproj test_libvpx.vcproj: $(LIBVPX_TEST_SRCS) @echo " [CREATE] $@" - $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \ --exe \ --target=$(TOOLCHAIN) \ --name=test_libvpx \ @@ -428,28 +440,6 @@ test_libvpx.vcproj: $(LIBVPX_TEST_SRCS) --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ -L. -l$(CODEC_LIB) -lwinmm -l$(GTEST_LIB) $^ -ifeq ($(CONFIG_STATIC_MSVCRT),--static-crt) -lib_sfx=mt -else -lib_sfx=md -endif - -define unit_test_vcproj_template -$(notdir $(1:.cc=.vcproj)): $(SRC_PATH_BARE)/$(1) - @echo " [vcproj] $$@" - $$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\ - --exe\ - --target=$$(TOOLCHAIN)\ - --name=$(notdir $(1:.cc=))\ - --ver=$$(CONFIG_VS_VERSION)\ - $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ - --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ - -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ - -L. 
-lvpxmd -lwinmm -lgtest$(lib_sfx) $$^ -endef - -$(foreach proj,$(LIBVPX_TEST_BINS),\ - $(eval $(call unit_test_vcproj_template,$(proj)))) PROJECTS-$(CONFIG_MSVS) += test_libvpx.vcproj @@ -461,24 +451,28 @@ else include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS)) GTEST_OBJS=$(call objs,$(GTEST_SRCS)) -$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src -$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS) LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a $(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS) LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS))) -$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src -$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include +$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src +$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS) +BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS) # Install test sources only if codec source is included INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f)) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS) +CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx) +CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a) $(foreach bin,$(LIBVPX_TEST_BINS),\ - $(if $(BUILD_LIBVPX),$(eval $(bin): libvpx.a libgtest.a ))\ + $(if $(BUILD_LIBVPX),$(eval $(bin): \ + lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\ $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\ $(LIBVPX_TEST_OBJS) \ -L. -lvpx -lgtest -lpthread -lm)\ @@ -503,3 +497,6 @@ libs.doxy: $(CODEC_DOC_SRCS) @echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@ @echo "INCLUDE_PATH += ." >> $@; @echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@ + +## Generate vpx_rtcd.h for all objects +$(OBJS-yes:.o=.d): $(BUILD_PFX)vpx_rtcd.h diff --git a/mainpage.dox b/mainpage.dox index 5613ae7717ae206749d3e72c84d559eb4adac448..e2ec280027e024e3b706c148bbac2212172c6d3a 100644 --- a/mainpage.dox +++ b/mainpage.dox @@ -12,8 +12,12 @@ This distribution of the WebM VP8 Codec SDK includes the following support: - \if vp8_encoder - \ref vp8_encoder \endif - \if vp8_decoder - \ref vp8_decoder \endif + \if vp8_encoder + - \ref vp8_encoder + \endif + \if vp8_decoder + - \ref vp8_decoder + \endif \section main_startpoints Starting Points @@ -24,8 +28,12 @@ - Read the \ref samples "sample code" for examples of how to interact with the codec. - \ref codec reference - \if encoder - \ref encoder reference \endif - \if decoder - \ref decoder reference \endif + \if encoder + - \ref encoder reference + \endif + \if decoder + - \ref decoder reference + \endif \section main_support Support Options & FAQ The WebM project is an open source project supported by its community. 
For diff --git a/nestegg/src/nestegg.c b/nestegg/src/nestegg.c index 63a0e83e55b6c69de1801111a77735b3e608af06..cc87788fea4818f6591baead4b6ea47c3c24f1c2 100644 --- a/nestegg/src/nestegg.c +++ b/nestegg/src/nestegg.c @@ -1272,7 +1272,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac if (total > block_size) return -1; - entry = ne_find_track_entry(ctx, track - 1); + entry = ne_find_track_entry(ctx, (unsigned int)(track - 1)); if (!entry) return -1; @@ -1291,7 +1291,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac pkt = ne_alloc(sizeof(*pkt)); pkt->track = track - 1; - pkt->timecode = abs_timecode * tc_scale * track_scale; + pkt->timecode = (uint64_t)(abs_timecode * tc_scale * track_scale); ctx->log(ctx, NESTEGG_LOG_DEBUG, "%sblock t %lld pts %f f %llx frames: %llu", block_id == ID_BLOCK ? "" : "simple", pkt->track, pkt->timecode / 1e9, flags, frames); @@ -1774,35 +1774,35 @@ nestegg_track_video_params(nestegg * ctx, unsigned int track, if (ne_get_uint(entry->video.pixel_width, &value) != 0) return -1; - params->width = value; + params->width = (unsigned int)value; if (ne_get_uint(entry->video.pixel_height, &value) != 0) return -1; - params->height = value; + params->height = (unsigned int)value; value = 0; ne_get_uint(entry->video.pixel_crop_bottom, &value); - params->crop_bottom = value; + params->crop_bottom = (unsigned int)value; value = 0; ne_get_uint(entry->video.pixel_crop_top, &value); - params->crop_top = value; + params->crop_top = (unsigned int)value; value = 0; ne_get_uint(entry->video.pixel_crop_left, &value); - params->crop_left = value; + params->crop_left = (unsigned int)value; value = 0; ne_get_uint(entry->video.pixel_crop_right, &value); - params->crop_right = value; + params->crop_right = (unsigned int)value; value = params->width; ne_get_uint(entry->video.display_width, &value); - params->display_width = value; + params->display_width = (unsigned int)value; value = params->height; ne_get_uint(entry->video.display_height, &value); - params->display_height = value; + params->display_height = (unsigned int)value; return 0; } @@ -1828,11 +1828,11 @@ nestegg_track_audio_params(nestegg * ctx, unsigned int track, value = 1; ne_get_uint(entry->audio.channels, &value); - params->channels = value; + params->channels = (unsigned int)value; value = 16; ne_get_uint(entry->audio.bit_depth, &value); - params->depth = value; + params->depth = (unsigned int)value; return 0; } @@ -1888,7 +1888,7 @@ nestegg_free_packet(nestegg_packet * pkt) int nestegg_packet_track(nestegg_packet * pkt, unsigned int * track) { - *track = pkt->track; + *track = (unsigned int)pkt->track; return 0; } diff --git a/solution.mk b/solution.mk index 2de1d8d3e292b6532319c9b27c5e8fdf3c848a69..948305f0594384f0202e40e7c977799c58d0a6a1 100644 --- a/solution.mk +++ b/solution.mk @@ -8,18 +8,19 @@ ## be found in the AUTHORS file in the root of the source tree. 
## +# libvpx reverse dependencies (targets that depend on libvpx) +VPX_NONDEPS=$(addsuffix .vcproj,vpx gtest obj_int_extract) +VPX_RDEPS=$(foreach vcp,\ + $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.vcproj=):vpx) vpx.sln: $(wildcard *.vcproj) @echo " [CREATE] $@" $(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \ - $(if $(filter %vpx.vcproj,$^),\ - $(foreach vcp,$(filter-out %vpx.vcproj %gtest.vcproj %obj_int_extract.vcproj,$^),\ - --dep=$(vcp:.vcproj=):vpx) \ - $(foreach vcp,$(filter %_test.vcproj,$^),\ - --dep=$(vcp:.vcproj=):gtest)) \ - --dep=vpx:obj_int_extract \ - --ver=$(CONFIG_VS_VERSION)\ - --out=$@ $^ + $(if $(filter vpx.vcproj,$^),$(VPX_RDEPS)) \ + --dep=vpx:obj_int_extract \ + --dep=test_libvpx:gtest \ + --ver=$(CONFIG_VS_VERSION)\ + --out=$@ $^ vpx.sln.mk: vpx.sln @true diff --git a/test/acm_random.h b/test/acm_random.h index dcd1bba8c673036893e45061aff495cc2492fbb8..514894edaf7758289c42925a28d09f85e92b10aa 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -19,6 +19,10 @@ namespace libvpx_test { class ACMRandom { public: + ACMRandom() { + Reset(DeterministicSeed()); + } + explicit ACMRandom(int seed) { Reset(seed); } diff --git a/test/altref_test.cc b/test/altref_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca055773df70ba7b416171df435dd238199934e1 --- /dev/null +++ b/test/altref_test.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" + +namespace { + +// lookahead range: [kLookAheadMin, kLookAheadMax). 
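+// ::testing::Range() generates the half-open interval [begin, end), so the
+// bounds below run the test with g_lag_in_frames values 5 through 25. A
+// non-zero lag gives the encoder the lookahead it needs to emit invisible
+// alt-ref frames, which FramePktHook() counts via VPX_FRAME_IS_INVISIBLE.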
+const int kLookAheadMin = 5; +const int kLookAheadMax = 26; + +class AltRefTest : public libvpx_test::EncoderTest, + public ::testing::TestWithParam { + protected: + AltRefTest() : altref_count_(0) {} + virtual ~AltRefTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + virtual void BeginPassHook(unsigned int pass) { + altref_count_ = 0; + } + + virtual bool Continue() const { + return !HasFatalFailure() && !abort_; + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_CPUUSED, 3); + } + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_; + } + + int altref_count() const { return altref_count_; } + + private: + int altref_count_; +}; + +TEST_P(AltRefTest, MonotonicTimestamps) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 1000; + cfg_.g_lag_in_frames = GetParam(); + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GE(altref_count(), 1); +} + +INSTANTIATE_TEST_CASE_P(NonZeroLag, AltRefTest, + ::testing::Range(kLookAheadMin, kLookAheadMax)); +} // namespace diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc index 119c77a790db6a1ea102c35b6889bf860bd75a7e..4e21be8c53c78dc623bd5f2ec67352548ff476f3 100644 --- a/test/boolcoder_test.cc +++ b/test/boolcoder_test.cc @@ -8,26 +8,28 @@ * be found in the AUTHORS file in the root of the source tree. */ +extern "C" { +#include "vp8/encoder/boolhuff.h" +#include "vp8/decoder/dboolhuff.h" +} + #include +#include +#include #include #include +#include +#include "test/acm_random.h" #include "third_party/googletest/src/include/gtest/gtest.h" - -extern "C" { -#include "vp9/encoder/boolhuff.h" -#include "vp9/decoder/dboolhuff.h" -} - -#include "acm_random.h" #include "vpx/vpx_integer.h" -using libvpx_test::ACMRandom; - namespace { const int num_tests = 10; } // namespace +using libvpx_test::ACMRandom; + TEST(VP8, TestBitIO) { ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int n = 0; n < num_tests; ++n) { @@ -38,15 +40,15 @@ TEST(VP8, TestBitIO) { for (int i = 0; i < bits_to_test; ++i) { const int parity = i & 1; probas[i] = - (method == 0) ? 0 : (method == 1) ? 255 : - (method == 2) ? 128 : - (method == 3) ? rnd.Rand8() : - (method == 4) ? (parity ? 0 : 255) : + (method == 0) ? 0 : (method == 1) ? 255 : + (method == 2) ? 128 : + (method == 3) ? rnd.Rand8() : + (method == 4) ? (parity ? 0 : 255) : // alternate between low and high proba: (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) : (method == 6) ? - (parity ? rnd(64) : 255 - rnd(64)) : - (parity ? rnd(32) : 255 - rnd(32)); + (parity ? rnd(64) : 255 - rnd(64)) : + (parity ? rnd(32) : 255 - rnd(32)); } for (int bit_method = 0; bit_method <= 3; ++bit_method) { const int random_seed = 6432; @@ -54,7 +56,7 @@ TEST(VP8, TestBitIO) { ACMRandom bit_rnd(random_seed); BOOL_CODER bw; uint8_t bw_buffer[buffer_size]; - vp8_start_encode(&bw, bw_buffer); + vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 
1 : 0; for (int i = 0; i < bits_to_test; ++i) { @@ -78,7 +80,7 @@ TEST(VP8, TestBitIO) { bit = bit_rnd(2); } GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit) - << "pos: " << i << " / " << bits_to_test + << "pos: "<< i << " / " << bits_to_test << " bit_method: " << bit_method << " method: " << method; } diff --git a/test/config_test.cc b/test/config_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4da46e2e96218146bb6bed9dbf7075b9de3e0f8 --- /dev/null +++ b/test/config_test.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/encode_test_driver.h" +#include "test/video_source.h" + +namespace { + +class ConfigTest : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam { + public: + ConfigTest() : frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(GetParam()); + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + frame_count_in_ = 0; + frame_count_out_ = 0; + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource* /*video*/) { + ++frame_count_in_; + abort_ |= (frame_count_in_ >= frame_count_max_); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) { + ++frame_count_out_; + } + + virtual bool Continue() const { + return !HasFatalFailure() && !abort_; + } + + unsigned int frame_count_in_; + unsigned int frame_count_out_; + unsigned int frame_count_max_; +}; + +TEST_P(ConfigTest, LagIsDisabled) { + frame_count_max_ = 2; + cfg_.g_lag_in_frames = 15; + + libvpx_test::DummyVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + EXPECT_EQ(frame_count_in_, frame_count_out_); +} + +INSTANTIATE_TEST_CASE_P(OnePassModes, ConfigTest, ONE_PASS_TEST_MODES); +} // namespace diff --git a/test/cq_test.cc b/test/cq_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..42ee2a2f83a14672a6a2194158aaaf49a9ad240d --- /dev/null +++ b/test/cq_test.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" + +// CQ level range: [kCQLevelMin, kCQLevelMax). 
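+// With kCQLevelStep == 8, ::testing::Range() yields the half-open sequence
+// 4, 12, 20, ..., 60, so each instantiation runs the two-pass encode at a
+// different constrained-quality (VPX_CQ) level under the same bitrate cap.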
+const int kCQLevelMin = 4; +const int kCQLevelMax = 63; +const int kCQLevelStep = 8; +const int kCQTargetBitrate = 2000; + +namespace { + +class CQTest : public libvpx_test::EncoderTest, + public ::testing::TestWithParam { + protected: + CQTest() : cq_level_(GetParam()) { init_flags_ = VPX_CODEC_USE_PSNR; } + virtual ~CQTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + file_size_ = 0; + psnr_ = 0.0; + n_frames_ = 0; + } + + virtual bool Continue() const { + return !HasFatalFailure() && !abort_; + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + if (cfg_.rc_end_usage == VPX_CQ) { + encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); + } + encoder->Control(VP8E_SET_CPUUSED, 3); + } + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0); + n_frames_++; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + file_size_ += pkt->data.frame.sz; + } + + double GetLinearPSNROverBitrate() const { + double avg_psnr = log10(psnr_ / n_frames_) * 10.0; + return pow(10.0, avg_psnr / 10.0) / file_size_; + } + + int file_size() const { return file_size_; } + int n_frames() const { return n_frames_; } + + private: + int cq_level_; + int file_size_; + double psnr_; + int n_frames_; +}; + +int prev_actual_bitrate = kCQTargetBitrate; +TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kCQTargetBitrate; + cfg_.g_lag_in_frames = 25; + + cfg_.rc_end_usage = VPX_CQ; + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double cq_psnr_lin = GetLinearPSNROverBitrate(); + const int cq_actual_bitrate = file_size() * 8 * 30 / (n_frames() * 1000); + EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate); + EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate); + prev_actual_bitrate = cq_actual_bitrate; + + // try targeting the approximate same bitrate with VBR mode + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_target_bitrate = cq_actual_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double vbr_psnr_lin = GetLinearPSNROverBitrate(); + EXPECT_GE(cq_psnr_lin, vbr_psnr_lin); +} + +INSTANTIATE_TEST_CASE_P(CQLevelRange, CQTest, + ::testing::Range(kCQLevelMin, kCQLevelMax, + kCQLevelStep)); +} // namespace diff --git a/test/datarate_test.cc b/test/datarate_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f2a2031fdbca54b7ed7dbc5b47c198a81dc2e101 --- /dev/null +++ b/test/datarate_test.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +namespace { + +class DatarateTest : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam { + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(GetParam()); + ResetModel(); + } + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + first_drop_ = 0; + bits_total_ = 0; + duration_ = 0.0; + } + + virtual bool Continue() const { + return !HasFatalFailure() && !abort_; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + // TODO(jimbankoski): Remove these lines when the issue: + // http://code.google.com/p/webm/issues/detail?id=496 is fixed. + // For now the codec assumes buffer starts at starting buffer rate + // plus one frame's time. + if (last_pts_ == 0) + duration = 1; + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += duration * timebase_ * cfg_.rc_target_bitrate + * 1000; + + /* Test the buffer model here before subtracting the frame. Do so because + * the way the leaky bucket model works in libvpx is to allow the buffer to + * empty - and then stop showing frames until we've got enough bits to + * show one. */ + ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " + << pkt->data.frame.pts; + + const int frame_size_in_bits = pkt->data.frame.sz * 8; + + // Subtract from the buffer the bits associated with a played back frame. + bits_in_buffer_model_ -= frame_size_in_bits; + + // Update the running total of bits for end of test datarate checks. + bits_total_ += frame_size_in_bits ; + + // If first drop not set and we have a drop set it to this time. + if (!first_drop_ && duration > 1) + first_drop_ = last_pts_ + 1; + + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + + // We update this so that we can calculate the datarate minus the last + // frame encoded in the file. + bits_in_last_frame_ = frame_size_in_bits; + + ++frame_number_; + } + + virtual void EndPassHook(void) { + if (bits_total_) { + const double file_size_in_kb = bits_total_ / 1000; /* bits per kilobit */ + + duration_ = (last_pts_ + 1) * timebase_; + + // Effective file datarate includes the time spent prebuffering. + effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 + / (cfg_.rc_buf_initial_sz / 1000.0 + duration_); + + file_datarate_ = file_size_in_kb / duration_; + } + } + + vpx_codec_pts_t last_pts_; + int bits_in_buffer_model_; + double timebase_; + int frame_number_; + vpx_codec_pts_t first_drop_; + int64_t bits_total_; + double duration_; + double file_datarate_; + double effective_datarate_; + int bits_in_last_frame_; +}; + +TEST_P(DatarateTest, BasicBufferModel) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene. 
Ignoring datarate + // constraints + // TODO(jimbankoski): ( Fix when issue + // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + + for (int i = 70; i < 700; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + << " The datarate for the file missed the target!"; + } +} + +TEST_P(DatarateTest, ChangingDropFrameThresh) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_max_quantizer = 36; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.kf_mode = VPX_KF_DISABLED; + + const int frame_count = 40; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, frame_count); + + // Here we check that the first dropped frame gets earlier and earlier + // as the drop frame threshold is increased. + + const int kDropFrameThreshTestStep = 30; + vpx_codec_pts_t last_drop = frame_count; + for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + } +} + +INSTANTIATE_TEST_CASE_P(AllModes, DatarateTest, ALL_TEST_MODES); +} // namespace diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc new file mode 100644 index 0000000000000000000000000000000000000000..3610f025d8994853a9fe5756ce83bdad1ee20449 --- /dev/null +++ b/test/decode_test_driver.cc @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "test/decode_test_driver.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/video_source.h" + +namespace libvpx_test { +#if CONFIG_VP8_DECODER +void Decoder::DecodeFrame(const uint8_t *cxdata, int size) { + if (!decoder_.priv) { + const vpx_codec_err_t res_init = vpx_codec_dec_init(&decoder_, + &vpx_codec_vp8_dx_algo, + &cfg_, 0); + ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError(); + } + + const vpx_codec_err_t res_dec = vpx_codec_decode(&decoder_, + cxdata, size, NULL, 0); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError(); +} + +void DecoderTest::RunLoop(CompressedVideoSource *video) { + vpx_codec_dec_cfg_t dec_cfg = {0}; + Decoder decoder(dec_cfg, 0); + + // Decode frames. 
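+  // A CompressedVideoSource acts as an iterator: Begin() opens the stream
+  // and loads the first frame, cxdata() returns NULL once the stream is
+  // exhausted, and Next() advances one frame. Each compressed frame may
+  // yield zero or more images, so the inner loop below drains the decoder
+  // through DxDataIterator before fetching the next frame.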
+ for (video->Begin(); video->cxdata(); video->Next()) { + decoder.DecodeFrame(video->cxdata(), video->frame_size()); + + DxDataIterator dec_iter = decoder.GetDxData(); + const vpx_image_t *img = NULL; + + // Get decompressed data + while ((img = dec_iter.Next())) + DecompressedFrameHook(*img, video->frame_number()); + } +} +#endif +} // namespace libvpx_test diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h new file mode 100644 index 0000000000000000000000000000000000000000..6408bee017320d85f02122f1af7cda9c75bc9033 --- /dev/null +++ b/test/decode_test_driver.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_DECODE_TEST_DRIVER_H_ +#define TEST_DECODE_TEST_DRIVER_H_ +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_config.h" +#include "vpx/vpx_decoder.h" +#include "vpx/vp8dx.h" + +namespace libvpx_test { + +class CompressedVideoSource; + +// Provides an object to handle decoding output +class DxDataIterator { + public: + explicit DxDataIterator(vpx_codec_ctx_t *decoder) + : decoder_(decoder), iter_(NULL) {} + + const vpx_image_t *Next() { + return vpx_codec_get_frame(decoder_, &iter_); + } + + private: + vpx_codec_ctx_t *decoder_; + vpx_codec_iter_t iter_; +}; + +// Provides a simplified interface to manage one video decoding. +// +// TODO: similar to Encoder class, the exact services should be +// added as more tests are added. +class Decoder { + public: + Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline) + : cfg_(cfg), deadline_(deadline) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + ~Decoder() { + vpx_codec_destroy(&decoder_); + } + + void DecodeFrame(const uint8_t *cxdata, int size); + + DxDataIterator GetDxData() { + return DxDataIterator(&decoder_); + } + + void set_deadline(unsigned long deadline) { + deadline_ = deadline; + } + + void Control(int ctrl_id, int arg) { + const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); + } + + protected: + const char *DecodeError() { + const char *detail = vpx_codec_error_detail(&decoder_); + return detail ? detail : vpx_codec_error(&decoder_); + } + + vpx_codec_ctx_t decoder_; + vpx_codec_dec_cfg_t cfg_; + unsigned int deadline_; +}; + +// Common test functionality for all Decoder tests. +class DecoderTest { + public: + // Main loop. + virtual void RunLoop(CompressedVideoSource *video); + + // Hook to be called on every decompressed frame. + virtual void DecompressedFrameHook(const vpx_image_t& img, + const unsigned int frame_number) {} + + protected: + DecoderTest() {} + + virtual ~DecoderTest() {} +}; + +} // namespace libvpx_test + +#endif // TEST_DECODE_TEST_DRIVER_H_ diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebb3959ed19a81546cbfa097cd84f937f713a758 --- /dev/null +++ b/test/encode_test_driver.cc @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vpx_config.h" +#include "test/encode_test_driver.h" +#if CONFIG_VP8_DECODER +#include "test/decode_test_driver.h" +#endif +#include "test/video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +namespace libvpx_test { +void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { + if (video->img()) + EncodeFrameInternal(*video, frame_flags); + else + Flush(); + + // Handle twopass stats + CxDataIterator iter = GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != VPX_CODEC_STATS_PKT) + continue; + + stats_->Append(*pkt); + } +} + +void Encoder::EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags) { + vpx_codec_err_t res; + const vpx_image_t *img = video.img(); + + // Handle first frame initialization + if (!encoder_.priv) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + cfg_.g_timebase = video.timebase(); + cfg_.rc_twopass_stats_in = stats_->buf(); + res = vpx_codec_enc_init(&encoder_, &vpx_codec_vp8_cx_algo, &cfg_, + init_flags_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = vpx_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + // Encode the frame + res = vpx_codec_encode(&encoder_, + video.img(), video.pts(), video.duration(), + frame_flags, deadline_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); +} + +void Encoder::Flush() { + const vpx_codec_err_t res = vpx_codec_encode(&encoder_, NULL, 0, 0, 0, + deadline_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); +} + +void EncoderTest::SetMode(TestMode mode) { + switch (mode) { + case kRealTime: + deadline_ = VPX_DL_REALTIME; + break; + + case kOnePassGood: + case kTwoPassGood: + deadline_ = VPX_DL_GOOD_QUALITY; + break; + + case kOnePassBest: + case kTwoPassBest: + deadline_ = VPX_DL_BEST_QUALITY; + break; + + default: + ASSERT_TRUE(false) << "Unexpected mode " << mode; + } + + if (mode == kTwoPassGood || mode == kTwoPassBest) + passes_ = 2; + else + passes_ = 1; +} +// The function should return "true" most of the time, therefore no early +// break-out is implemented within the match checking process. 
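+// compare_img() walks the Y plane at full resolution and the U/V planes at
+// ((w + 1) / 2) x ((h + 1) / 2), row by row and honoring each image's
+// stride; every row comparison is folded into 'match' with && instead of
+// returning at the first mismatch.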
+static bool compare_img(const vpx_image_t *img1, + const vpx_image_t *img2) { + bool match = (img1->fmt == img2->fmt) && + (img1->d_w == img2->d_w) && + (img1->d_h == img2->d_h); + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i; + for (i = 0; i < height_y; ++i) + match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + width_y) == 0) && match; + const unsigned int width_uv = (img1->d_w + 1) >> 1; + const unsigned int height_uv = (img1->d_h + 1) >> 1; + for (i = 0; i < height_uv; ++i) + match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + width_uv) == 0) && match; + for (i = 0; i < height_uv; ++i) + match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + width_uv) == 0) && match; + return match; +} + +void EncoderTest::RunLoop(VideoSource *video) { +#if CONFIG_VP8_DECODER + vpx_codec_dec_cfg_t dec_cfg = {0}; +#endif + + stats_.Reset(); + + for (unsigned int pass = 0; pass < passes_; pass++) { + last_pts_ = 0; + + if (passes_ == 1) + cfg_.g_pass = VPX_RC_ONE_PASS; + else if (pass == 0) + cfg_.g_pass = VPX_RC_FIRST_PASS; + else + cfg_.g_pass = VPX_RC_LAST_PASS; + + BeginPassHook(pass); + Encoder encoder(cfg_, deadline_, init_flags_, &stats_); +#if CONFIG_VP8_DECODER + Decoder decoder(dec_cfg, 0); + bool has_cxdata = false; +#endif + bool again; + for (again = true, video->Begin(); again; video->Next()) { + again = video->img() != NULL; + + PreEncodeFrameHook(video); + PreEncodeFrameHook(video, &encoder); + encoder.EncodeFrame(video, frame_flags_); + + CxDataIterator iter = encoder.GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + again = true; + + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: +#if CONFIG_VP8_DECODER + has_cxdata = true; + decoder.DecodeFrame((const uint8_t*)pkt->data.frame.buf, + pkt->data.frame.sz); +#endif + ASSERT_GE(pkt->data.frame.pts, last_pts_); + last_pts_ = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case VPX_CODEC_PSNR_PKT: + PSNRPktHook(pkt); + break; + + default: + break; + } + } + +#if CONFIG_VP8_DECODER + if (has_cxdata) { + const vpx_image_t *img_enc = encoder.GetPreviewFrame(); + DxDataIterator dec_iter = decoder.GetDxData(); + const vpx_image_t *img_dec = dec_iter.Next(); + if(img_enc && img_dec) { + const bool res = compare_img(img_enc, img_dec); + ASSERT_TRUE(res)<< "Encoder/Decoder mismatch found."; + } + } +#endif + if (!Continue()) + break; + } + + EndPassHook(); + + if (!Continue()) + break; + } +} +} // namespace libvpx_test diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h new file mode 100644 index 0000000000000000000000000000000000000000..0141fa9107c2c2a6604643cbe0ccb88819b0c702 --- /dev/null +++ b/test/encode_test_driver.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef TEST_ENCODE_TEST_DRIVER_H_ +#define TEST_ENCODE_TEST_DRIVER_H_ +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" + +namespace libvpx_test { + +class VideoSource; + +enum TestMode { + kRealTime, + kOnePassGood, + kOnePassBest, + kTwoPassGood, + kTwoPassBest +}; +#define ALL_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \ + ::libvpx_test::kOnePassGood, \ + ::libvpx_test::kOnePassBest, \ + ::libvpx_test::kTwoPassGood, \ + ::libvpx_test::kTwoPassBest) + +#define ONE_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \ + ::libvpx_test::kOnePassGood, \ + ::libvpx_test::kOnePassBest) + + +// Provides an object to handle the libvpx get_cx_data() iteration pattern +class CxDataIterator { + public: + explicit CxDataIterator(vpx_codec_ctx_t *encoder) + : encoder_(encoder), iter_(NULL) {} + + const vpx_codec_cx_pkt_t *Next() { + return vpx_codec_get_cx_data(encoder_, &iter_); + } + + private: + vpx_codec_ctx_t *encoder_; + vpx_codec_iter_t iter_; +}; + +// Implements an in-memory store for libvpx twopass statistics +class TwopassStatsStore { + public: + void Append(const vpx_codec_cx_pkt_t &pkt) { + buffer_.append(reinterpret_cast(pkt.data.twopass_stats.buf), + pkt.data.twopass_stats.sz); + } + + vpx_fixed_buf_t buf() { + const vpx_fixed_buf_t buf = { &buffer_[0], buffer_.size() }; + return buf; + } + + void Reset() { + buffer_.clear(); + } + + protected: + std::string buffer_; +}; + + +// Provides a simplified interface to manage one video encoding pass, given +// a configuration and video source. +// +// TODO(jkoleszar): The exact services it provides and the appropriate +// level of abstraction will be fleshed out as more tests are written. +class Encoder { + public: + Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + const unsigned long init_flags, TwopassStatsStore *stats) + : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) { + memset(&encoder_, 0, sizeof(encoder_)); + } + + ~Encoder() { + vpx_codec_destroy(&encoder_); + } + + CxDataIterator GetCxData() { + return CxDataIterator(&encoder_); + } + + const vpx_image_t *GetPreviewFrame() { + return vpx_codec_get_preview_frame(&encoder_); + } + // This is a thin wrapper around vpx_codec_encode(), so refer to + // vpx_encoder.h for its semantics. + void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + + // Convenience wrapper for EncodeFrame() + void EncodeFrame(VideoSource *video) { + EncodeFrame(video, 0); + } + + void Control(int ctrl_id, int arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void set_deadline(unsigned long deadline) { + deadline_ = deadline; + } + + protected: + const char *EncoderError() { + const char *detail = vpx_codec_error_detail(&encoder_); + return detail ? detail : vpx_codec_error(&encoder_); + } + + // Encode an image + void EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags); + + // Flush the encoder on EOS + void Flush(); + + vpx_codec_ctx_t encoder_; + vpx_codec_enc_cfg_t cfg_; + unsigned long deadline_; + unsigned long init_flags_; + TwopassStatsStore *stats_; +}; + +// Common test functionality for all Encoder tests. +// +// This class is a mixin which provides the main loop common to all +// encoder tests. 
It provides hooks which can be overridden by subclasses +// to implement each test's specific behavior, while centralizing the bulk +// of the boilerplate. Note that it doesn't inherit the gtest testing +// classes directly, so that tests can be parameterized differently. +class EncoderTest { + protected: + EncoderTest() : abort_(false), init_flags_(0), frame_flags_(0), + last_pts_(0) {} + + virtual ~EncoderTest() {} + + // Initialize the cfg_ member with the default configuration. + void InitializeConfig() { + const vpx_codec_err_t res = vpx_codec_enc_config_default( + &vpx_codec_vp8_cx_algo, &cfg_, 0); + ASSERT_EQ(VPX_CODEC_OK, res); + } + + // Map the TestMode enum to the deadline_ and passes_ variables. + void SetMode(TestMode mode); + + // Main loop. + virtual void RunLoop(VideoSource *video); + + // Hook to be called at the beginning of a pass. + virtual void BeginPassHook(unsigned int pass) {} + + // Hook to be called at the end of a pass. + virtual void EndPassHook() {} + + // Hook to be called before encoding a frame. + virtual void PreEncodeFrameHook(VideoSource *video) {} + virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {} + + // Hook to be called on every compressed data packet. + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {} + + // Hook to be called on every PSNR packet. + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {} + + // Hook to determine whether the encode loop should continue. + virtual bool Continue() const { return !abort_; } + + bool abort_; + vpx_codec_enc_cfg_t cfg_; + unsigned int passes_; + unsigned long deadline_; + TwopassStatsStore stats_; + unsigned long init_flags_; + unsigned long frame_flags_; + vpx_codec_pts_t last_pts_; +}; + +} // namespace libvpx_test + +#endif // TEST_ENCODE_TEST_DRIVER_H_ diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..25c67310aa0c665aa599c3074345fde318b42928 --- /dev/null +++ b/test/error_resilience_test.cc @@ -0,0 +1,90 @@ +/* + Copyright (c) 2012 The WebM project authors. All Rights Reserved. + + Use of this source code is governed by a BSD-style license + that can be found in the LICENSE file in the root of the source + tree. An additional intellectual property rights grant can be found + in the file PATENTS. All contributing project authors may + be found in the AUTHORS file in the root of the source tree. 
+*/ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" + +namespace { + +class ErrorResilienceTest : public libvpx_test::EncoderTest, + public ::testing::TestWithParam { + protected: + ErrorResilienceTest() { + psnr_ = 0.0; + nframes_ = 0; + encoding_mode_ = static_cast(GetParam()); + } + virtual ~ErrorResilienceTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual bool Continue() const { + return !HasFatalFailure() && !abort_; + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + double GetAveragePsnr() const { + if (nframes_) + return psnr_ / nframes_; + return 0.0; + } + + private: + double psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; +}; + +TEST_P(ErrorResilienceTest, OnVersusOff) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + + // Error resilient mode OFF. + cfg_.g_error_resilient = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_resilience_off = GetAveragePsnr(); + EXPECT_GT(psnr_resilience_off, 25.0); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_resilience_on = GetAveragePsnr(); + EXPECT_GT(psnr_resilience_on, 25.0); + + // Test that turning on error resilient mode hurts by 10% at most. + if (psnr_resilience_off > 0.0) { + const double psnr_ratio = psnr_resilience_on / psnr_resilience_off; + EXPECT_GE(psnr_ratio, 0.9); + EXPECT_LE(psnr_ratio, 1.1); + } +} + +INSTANTIATE_TEST_CASE_P(OnOffTest, ErrorResilienceTest, + ONE_PASS_TEST_MODES); +} // namespace diff --git a/test/i420_video_source.h b/test/i420_video_source.h new file mode 100644 index 0000000000000000000000000000000000000000..219bd3393c864567b3ce58fbbd55cec45edfdc3f --- /dev/null +++ b/test/i420_video_source.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef TEST_I420_VIDEO_SOURCE_H_ +#define TEST_I420_VIDEO_SOURCE_H_ +#include +#include + +#include "test/video_source.h" + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. +class I420VideoSource : public VideoSource { + public: + I420VideoSource(const std::string &file_name, + unsigned int width, unsigned int height, + int rate_numerator, int rate_denominator, + unsigned int start, int limit) + : file_name_(file_name), + input_file_(NULL), + img_(NULL), + start_(start), + limit_(limit), + frame_(0), + width_(0), + height_(0), + framerate_numerator_(rate_numerator), + framerate_denominator_(rate_denominator) { + + // This initializes raw_sz_, width_, height_ and allocates an img. 
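+    // An I420 frame is one full-resolution luma plane plus two
+    // quarter-resolution chroma planes, i.e. width * height * 3 / 2 bytes;
+    // SetSize() records that as raw_sz_, which Begin() and FillFrame() use
+    // for whole-frame fseek()/fread() calls.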
+ SetSize(width, height); + } + + virtual ~I420VideoSource() { + vpx_img_free(img_); + if (input_file_) + fclose(input_file_); + } + + virtual void Begin() { + if (input_file_) + fclose(input_file_); + input_file_ = OpenTestDataFile(file_name_); + ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + << file_name_; + if (start_) { + fseek(input_file_, raw_sz_ * start_, SEEK_SET); + } + + frame_ = start_; + FillFrame(); + } + + virtual void Next() { + ++frame_; + FillFrame(); + } + + virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + virtual vpx_codec_pts_t pts() const { return frame_; } + + virtual unsigned long duration() const { return 1; } + + virtual vpx_rational_t timebase() const { + const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; + return t; + } + + virtual unsigned int frame() const { return frame_; } + + virtual unsigned int limit() const { return limit_; } + + void SetSize(unsigned int width, unsigned int height) { + if (width != width_ || height != height_) { + vpx_img_free(img_); + img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1); + ASSERT_TRUE(img_ != NULL); + width_ = width; + height_ = height; + raw_sz_ = width * height * 3 / 2; + } + } + + virtual void FillFrame() { + // Read a frame from input_file. + if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) { + limit_ = frame_; + } + } + + protected: + std::string file_name_; + FILE *input_file_; + vpx_image_t *img_; + size_t raw_sz_; + unsigned int start_; + unsigned int limit_; + unsigned int frame_; + unsigned int width_; + unsigned int height_; + unsigned int framerate_numerator_; + unsigned int framerate_denominator_; +}; + +} // namespace libvpx_test + +#endif // TEST_I420_VIDEO_SOURCE_H_ diff --git a/test/idctllm_test.cc b/test/idctllm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd42e2299952576abfc5c3e0dd064ac7445070db --- /dev/null +++ b/test/idctllm_test.cc @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+}
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+                          int pred_stride, unsigned char *dst_ptr,
+                          int dst_stride);
+namespace {
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
+{
+ protected:
+  virtual void SetUp()
+  {
+    int i;
+
+    UUT = GetParam();
+    memset(input, 0, sizeof(input));
+    /* Set up guard blocks */
+    for(i=0; i<256; i++)
+      output[i] = ((i&0xF)<4&&(i<64))?0:-1;
+  }
+
+  idct_fn_t UUT;
+  short input[16];
+  unsigned char output[256];
+  unsigned char predict[256];
+};
+
+TEST_P(IDCTTest, TestGuardBlocks)
+{
+  int i;
+
+  for(i=0; i<256; i++)
+    if((i&0xF) < 4 && i<64)
+      EXPECT_EQ(0, output[i]) << i;
+    else
+      EXPECT_EQ(255, output[i]);
+}
+
+TEST_P(IDCTTest, TestAllZeros)
+{
+  int i;
+
+  UUT(input, output, 16, output, 16);
+
+  for(i=0; i<256; i++)
+    if((i&0xF) < 4 && i<64)
+      EXPECT_EQ(0, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAllOnes)
+{
+  int i;
+
+  input[0] = 4;
+  UUT(input, output, 16, output, 16);
+
+  for(i=0; i<256; i++)
+    if((i&0xF) < 4 && i<64)
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAddOne)
+{
+  int i;
+
+  for(i=0; i<256; i++)
+    predict[i] = i;
+
+  input[0] = 4;
+  UUT(input, predict, 16, output, 16);
+
+  for(i=0; i<256; i++)
+    if((i&0xF) < 4 && i<64)
+      EXPECT_EQ(i+1, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestWithData)
+{
+  int i;
+
+  for(i=0; i<16; i++)
+    input[i] = i;
+
+  UUT(input, output, 16, output, 16);
+
+  for(i=0; i<256; i++)
+    if((i&0xF) > 3 || i>63)
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    else if(i == 0)
+      EXPECT_EQ(11, output[i]) << "i==" << i;
+    else if(i == 34)
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    else if(i == 2 || i == 17 || i == 32)
+      EXPECT_EQ(3, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(0, output[i]) << "i==" << i;
+}
+
+INSTANTIATE_TEST_CASE_P(C, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_mmx));
+#endif
+}
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2e0d61a128208efc9fcd2f0ab9c73cee14eabb5
--- /dev/null
+++ b/test/intrapred_test.cc
@@ -0,0 +1,354 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + + +#include +#include "test/acm_random.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +extern "C" { +#include "vpx_config.h" +#include "vpx_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +} + +namespace { + +using libvpx_test::ACMRandom; + +class IntraPredBase { + protected: + void SetupMacroblock(uint8_t *data, int block_size, int stride, + int num_planes) { + memset(&mb_, 0, sizeof(mb_)); + memset(&mi_, 0, sizeof(mi_)); + mb_.up_available = 1; + mb_.left_available = 1; + mb_.mode_info_context = &mi_; + stride_ = stride; + block_size_ = block_size; + num_planes_ = num_planes; + for (int p = 0; p < num_planes; p++) + data_ptr_[p] = data + stride * (block_size + 1) * p + + stride + block_size; + } + + void FillRandom() { + // Fill edges with random data + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int p = 0; p < num_planes_; p++) { + for (int x = -1 ; x <= block_size_; x++) + data_ptr_[p][x - stride_] = rnd.Rand8(); + for (int y = 0; y < block_size_; y++) + data_ptr_[p][y * stride_ - 1] = rnd.Rand8(); + } + } + + virtual void Predict(MB_PREDICTION_MODE mode) = 0; + + void SetLeftUnavailable() { + mb_.left_available = 0; + for (int p = 0; p < num_planes_; p++) + for (int i = -1; i < block_size_; ++i) + data_ptr_[p][stride_ * i - 1] = 129; + } + + void SetTopUnavailable() { + mb_.up_available = 0; + for (int p = 0; p < num_planes_; p++) + memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2); + } + + void SetTopLeftUnavailable() { + SetLeftUnavailable(); + SetTopUnavailable(); + } + + int BlockSizeLog2Min1() const { + switch (block_size_) { + case 16: + return 3; + case 8: + return 2; + default: + return 0; + } + } + + // check DC prediction output against a reference + void CheckDCPrediction() const { + for (int p = 0; p < num_planes_; p++) { + // calculate expected DC + int expected; + if (mb_.up_available || mb_.left_available) { + int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available + + mb_.left_available; + if (mb_.up_available) + for (int x = 0; x < block_size_; x++) + sum += data_ptr_[p][x - stride_]; + if (mb_.left_available) + for (int y = 0; y < block_size_; y++) + sum += data_ptr_[p][y * stride_ - 1]; + expected = (sum + (1 << (shift - 1))) >> shift; + } else + expected = 0x80; + + // check that all subsequent lines are equal to the first + for (int y = 1; y < block_size_; ++y) + ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_], + block_size_)); + // within the first line, ensure that each pixel has the same value + for (int x = 1; x < block_size_; ++x) + ASSERT_EQ(data_ptr_[p][0], data_ptr_[p][x]); + // now ensure that that pixel has the expected (DC) value + ASSERT_EQ(expected, data_ptr_[p][0]); + } + } + + // check V prediction output against a reference + void CheckVPrediction() const { + // check that all lines equal the top border + for (int p = 0; p < num_planes_; p++) + for (int y = 0; y < block_size_; y++) + ASSERT_EQ(0, memcmp(&data_ptr_[p][-stride_], + &data_ptr_[p][y * stride_], block_size_)); + } + + // check H prediction output against a reference + void CheckHPrediction() const { + // for each line, ensure that each pixel is equal to the left border + for (int p = 0; p < num_planes_; p++) + for (int y = 0; y < block_size_; y++) + for (int x = 0; x < block_size_; x++) + ASSERT_EQ(data_ptr_[p][-1 + y * stride_], + data_ptr_[p][x + y * stride_]); + } + + static int ClipByte(int value) { + if (value > 255) + return 255; + else if (value < 0) + return 0; + return value; + } + + // check TM 
prediction output against a reference
+  void CheckTMPrediction() const {
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        for (int x = 0; x < block_size_; x++) {
+          const int expected = ClipByte(data_ptr_[p][x - stride_]
+                                        + data_ptr_[p][stride_ * y - 1]
+                                        - data_ptr_[p][-1 - stride_]);
+          ASSERT_EQ(expected, data_ptr_[p][y * stride_ + x]);
+        }
+  }
+
+  // Actual test
+  void RunTest() {
+    {
+      SCOPED_TRACE("DC_PRED");
+      FillRandom();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED LEFT");
+      FillRandom();
+      SetLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP");
+      FillRandom();
+      SetTopUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP_LEFT");
+      FillRandom();
+      SetTopLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("H_PRED");
+      FillRandom();
+      Predict(H_PRED);
+      CheckHPrediction();
+    }
+    {
+      SCOPED_TRACE("V_PRED");
+      FillRandom();
+      Predict(V_PRED);
+      CheckVPrediction();
+    }
+    {
+      SCOPED_TRACE("TM_PRED");
+      FillRandom();
+      Predict(TM_PRED);
+      CheckTMPrediction();
+    }
+  }
+
+  MACROBLOCKD mb_;
+  MODE_INFO mi_;
+  uint8_t *data_ptr_[2];  // in the case of Y, only [0] is used
+  int stride_;
+  int block_size_;
+  int num_planes_;
+};
+
+typedef void (*intra_pred_y_fn_t)(MACROBLOCKD *x,
+                                  uint8_t *yabove_row,
+                                  uint8_t *yleft,
+                                  int left_stride,
+                                  uint8_t *ypred_ptr,
+                                  int y_stride);
+
+class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,
+                       protected IntraPredBase {
+ public:
+  static void SetUpTestCase() {
+    data_array_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(data_array_);
+    data_array_ = NULL;
+  }
+
+ protected:
+  static const int kBlockSize = 16;
+  static const int kDataAlignment = 16;
+  static const int kStride = kBlockSize * 3;
+  // We use 48 so that the data pointer of the first pixel in each row of
+  // each macroblock is 16-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 17 lines so we have one line above us for top-prediction.
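+  // kDataBufferSize is therefore kBlockSize + 1 rows of kStride bytes; the
+  // extra row holds the prediction edge above the block. SetupMacroblock()
+  // points data_ptr_ one full row plus kBlockSize columns into the buffer,
+  // so index -1 reaches the left border and -kStride the row above.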
+  static const int kDataBufferSize = kStride * (kBlockSize + 1);
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 1);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
+             data_ptr_[0], kStride);
+  }
+
+  intra_pred_y_fn_t pred_fn_;
+  static uint8_t* data_array_;
+};
+
+uint8_t* IntraPredYTest::data_array_ = NULL;
+
+TEST_P(IntraPredYTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_ssse3));
+#endif
+
+typedef void (*intra_pred_uv_fn_t)(MACROBLOCKD *x,
+                                   uint8_t *uabove_row,
+                                   uint8_t *vabove_row,
+                                   uint8_t *uleft,
+                                   uint8_t *vleft,
+                                   int left_stride,
+                                   uint8_t *upred_ptr,
+                                   uint8_t *vpred_ptr,
+                                   int pred_stride);
+
+class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,
+                        protected IntraPredBase {
+ public:
+  static void SetUpTestCase() {
+    data_array_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(data_array_);
+    data_array_ = NULL;
+  }
+
+ protected:
+  static const int kBlockSize = 8;
+  static const int kDataAlignment = 8;
+  static const int kStride = kBlockSize * 3;
+  // We use a stride of 24 so that the data pointer of the first pixel in
+  // each row of each macroblock is 8-byte aligned, and this gives us access
+  // to the top-left and top-right corner pixels belonging to the
+  // top-left/right macroblocks.
+  // We use 9 lines so we have one line above us for top-prediction.
+  // [0] = U, [1] = V
+  static const int kDataBufferSize = 2 * kStride * (kBlockSize + 1);
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 2);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.uv_mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
+             data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
+             data_ptr_[0], data_ptr_[1], kStride);
+  }
+
+  intra_pred_uv_fn_t pred_fn_;
+  static uint8_t* data_array_;
+};
+
+uint8_t* IntraPredUVTest::data_array_ = NULL;
+
+TEST_P(IntraPredUVTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_ssse3));
+#endif
+
+}  // namespace
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
new file mode 100644
index 0000000000000000000000000000000000000000..48c3a7dcd8f21e187f6c486ece38e215d56c5988
--- /dev/null
+++ b/test/ivf_video_source.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_IVF_VIDEO_SOURCE_H_
+#define TEST_IVF_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "test/video_source.h"
+
+namespace libvpx_test {
+const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name),
+        input_file_(NULL),
+        compressed_frame_buf_(NULL),
+        frame_sz_(0),
+        frame_(0),
+        end_of_file_(false) {
+  }
+
+  virtual ~IVFVideoSource() {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Init() {
+    // Allocate a buffer for reading in the compressed video frames.
+    compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
+    ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
+  }
+
+  virtual void Begin() {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+                             << file_name_;
+
+    // Read file header
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check file header
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' && file_hdr[2] == 'I'
+                && file_hdr[3] == 'F') << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
+        != kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+    }
+  }
+
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : compressed_frame_buf_;
+  }
+  virtual const unsigned int frame_size() const { return frame_sz_; }
+  virtual const unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;
+  unsigned int frame_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_IVF_VIDEO_SOURCE_H_
diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0c81df99825a8e1a5f50505ec477b39ed89a438
--- /dev/null
+++ b/test/keyframe_test.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+class KeyframeTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+    kf_count_ = 0;
+    kf_count_max_ = INT_MAX;
+    kf_do_force_kf_ = false;
+    set_cpu_used_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (kf_do_force_kf_)
+      frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
+    if (set_cpu_used_ && video->frame() == 1)
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+      kf_pts_list_.push_back(pkt->data.frame.pts);
+      kf_count_++;
+      abort_ |= kf_count_ > kf_count_max_;
+    }
+  }
+
+  bool kf_do_force_kf_;
+  int kf_count_;
+  int kf_count_max_;
+  std::vector<vpx_codec_pts_t> kf_pts_list_;
+  int set_cpu_used_;
+};
+
+TEST_P(KeyframeTest, TestRandomVideoSource) {
+  // Validate that encoding the RandomVideoSource produces multiple keyframes.
+  // This validates the results of the TestDisableKeyframes test.
+  kf_count_max_ = 2;  // early exit successful tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode, auto-placed keyframes are exceedingly rare; don't
+  // bother with this check.
+  if (GetParam() > 0)
+    EXPECT_GT(kf_count_, 1);
+}
+
+TEST_P(KeyframeTest, TestDisableKeyframes) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_count_max_ = 1;  // early exit failed tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(1, kf_count_);
+}
+
+TEST_P(KeyframeTest, TestForceKeyframe) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_do_force_kf_ = true;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Verify that every third frame is a keyframe.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 3) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
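+// Editor's note (usage sketch, not part of the original change; codec, img
+// and pts are hypothetical names): the forced keyframes above come from
+// PreEncodeFrameHook() setting frame_flags_, which in the plain libvpx API
+// corresponds to passing the flag directly to the encode call, e.g.
+//   vpx_codec_encode(&codec, img, pts, 1 /* duration */,
+//                    VPX_EFLAG_FORCE_KF, VPX_DL_GOOD_QUALITY);
+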
+TEST_P(KeyframeTest, TestKeyframeMaxDistance) {
+  cfg_.kf_max_dist = 25;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Verify that the keyframe interval matches kf_max_dist.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 25) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+TEST_P(KeyframeTest, TestAutoKeyframe) {
+  cfg_.kf_mode = VPX_KF_AUTO;
+  kf_do_force_kf_ = false;
+
+  // Force a deterministic speed step in Real Time mode, as the faster modes
+  // may not produce a keyframe like we expect. This is necessary when running
+  // on very slow environments (like Valgrind). The step -11 was determined
+  // experimentally as the fastest mode that still throws the keyframe.
+  if (deadline_ == VPX_DL_REALTIME)
+    set_cpu_used_ = -11;
+
+  // This clip has a cut scene every 30 frames -> Frame 0, 30, 60, 90, 120.
+  // Check only the first 40 frames to make sure there's a keyframe at frame
+  // 0 and 30.
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode, auto-placed keyframes are exceedingly rare; don't
+  // bother with this check.
+  if (GetParam() > 0)
+    EXPECT_EQ(2u, kf_pts_list_.size()) << "Not the right number of keyframes";
+
+  // Verify that the keyframes match the cut scenes in the file.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
+      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
+                                     << *iter;
+    else
+      EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(AllModes, KeyframeTest, ALL_TEST_MODES);
+}  // namespace
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af2f3bda9d1d4095a43b375055e6e43b7a7ac7d1
--- /dev/null
+++ b/test/pp_filter_test.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*post_proc_func_t)(unsigned char *src_ptr,
+                                 unsigned char *dst_ptr,
+                                 int src_pixels_per_line,
+                                 int dst_pixels_per_line,
+                                 int cols,
+                                 unsigned char *flimit,
+                                 int size);
+
+namespace {
+
+class Vp8PostProcessingFilterTest
+    : public ::testing::TestWithParam<post_proc_func_t> {};
+
+// Test routine for the VP8 post-processing function
+// vp8_post_proc_down_and_across_mb_row_c.
+
+TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
+  // Size of the underlying data block that will be filtered.
+  const int block_width = 16;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  const int input_width = block_width;
+  const int input_height = block_height + 4;
+  const int input_stride = input_width;
+  const int input_size = input_width * input_height;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  const int output_width = block_width + 16;
+  const int output_height = block_height;
+  const int output_stride = output_width;
+  const int output_size = output_width * output_height;
+
+  uint8_t *const src_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(input_size, 1));
+  uint8_t *const dst_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(output_size, 1));
+
+  // Pointers to top-left pixel of block in the input and output images.
+  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
+  uint8_t *const dst_image_ptr = dst_image + 8;
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t*>(vpx_memalign(16, block_width));
+  (void)vpx_memset(flimits, 255, block_width);
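+  // Editor's note (assumption, not part of the original change): flimits
+  // supplies one filter limit per column; setting every entry to 255 means
+  // the limit never suppresses filtering here, so the expected output is
+  // determined entirely by the pixel pattern set up below.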
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  (void)vpx_memset(src_image, 10, input_size);
+  uint8_t *pixel_ptr = src_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      pixel_ptr[j] = 1;
+    }
+    pixel_ptr += input_stride;
+  }
+
+  // Initialize pixels in the output to 99.
+  (void)vpx_memset(dst_image, 99, output_size);
+
+  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+             output_stride, block_width, flimits, 16);
+
+  static const uint8_t expected_data[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };
+
+  pixel_ptr = dst_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      EXPECT_EQ(expected_data[i], pixel_ptr[j])
+          << "Vp8PostProcessingFilterTest failed with invalid filter output";
+    }
+    pixel_ptr += output_stride;
+  }
+
+  vpx_free(src_image);
+  vpx_free(dst_image);
+  vpx_free(flimits);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
+#endif
+
+}  // namespace
diff --git a/test/resize_test.cc b/test/resize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c846157eadcd9bf09f7bb2e6b27f3f80b5a48723
--- /dev/null
+++ b/test/resize_test.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "test/encode_test_driver.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kInitialWidth = 320;
+const unsigned int kInitialHeight = 240;
+
+unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
+  if (frame < 10)
+    return val;
+  if (frame < 20)
+    return val / 2;
+  if (frame < 30)
+    return val * 2 / 3;
+  if (frame < 40)
+    return val / 4;
+  if (frame < 50)
+    return val * 7 / 8;
+  return val;
+}
+
+class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingVideoSource() {
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 60;
+  }
+
+ protected:
+  virtual void Next() {
+    ++frame_;
+    SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
+            ScaleForFrameNumber(frame_, kInitialHeight));
+    FillFrame();
+  }
+};
+
+class ResizeTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ protected:
+  struct FrameInfo {
+    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+        : pts(_pts), w(_w), h(_h) {}
+
+    vpx_codec_pts_t pts;
+    unsigned int w;
+    unsigned int h;
+  };
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+      const unsigned char *buf =
+          reinterpret_cast<const unsigned char *>(pkt->data.frame.buf);
+      const unsigned int w = (buf[6] | (buf[7] << 8)) & 0x3fff;
+      const unsigned int h = (buf[8] | (buf[9] << 8)) & 0x3fff;
+
+      frame_info_list_.push_back(FrameInfo(pkt->data.frame.pts, w, h));
+    }
+  }
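+  // Editor's note (not part of the original change): the parse above follows
+  // the VP8 keyframe header -- after the 3-byte frame tag and the 3-byte
+  // sync code 0x9d 0x01 0x2a, bytes 6-7 and 8-9 are little-endian 16-bit
+  // fields whose low 14 bits carry the frame width and height.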
+
+  std::vector<FrameInfo> frame_info_list_;
+};
+
+TEST_P(ResizeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const vpx_codec_pts_t pts = info->pts;
+    const unsigned int expected_w = ScaleForFrameNumber(pts, kInitialWidth);
+    const unsigned int expected_h = ScaleForFrameNumber(pts, kInitialHeight);
+
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << pts << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << pts << " had unexpected height";
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(OnePass, ResizeTest, ONE_PASS_TEST_MODES);
+}  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b562e6dd94332434b69c9015a4ebd5a5839253f
--- /dev/null
+++ b/test/sad_test.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+
+typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
+                                        int source_stride,
+                                        const unsigned char *reference_ptr,
+                                        int reference_stride,
+                                        unsigned int max_sad);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
+ public:
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+    vpx_free(reference_data_);
+    reference_data_ = NULL;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 16 * 32;
+
+  virtual void SetUp() {
+    sad_fn_ = GET_PARAM(2);
+    height_ = GET_PARAM(1);
+    width_ = GET_PARAM(0);
+    source_stride_ = width_ * 2;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  sad_m_by_n_fn_t sad_fn_;
+  virtual unsigned int SAD(unsigned int max_sad) {
+    return sad_fn_(source_data_, source_stride_,
+                   reference_data_, reference_stride_,
+                   max_sad);
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
+  unsigned int ReferenceSAD(unsigned int max_sad) {
+    unsigned int sad = 0;
+
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        sad += abs(source_data_[h * source_stride_ + w]
+               - reference_data_[h * reference_stride_ + w]);
+      }
+      if (sad > max_sad) {
+        break;
+      }
+    }
+    return sad;
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void CheckSad(unsigned int max_sad) {
+    unsigned int reference_sad, exp_sad;
+
+    reference_sad = ReferenceSAD(max_sad);
+    exp_sad = SAD(max_sad);
+
+    if (reference_sad <= max_sad) {
+      ASSERT_EQ(exp_sad, reference_sad);
+    } else {
+      // Alternative implementations are not required to check max_sad
+      ASSERT_GE(exp_sad, reference_sad);
+    }
+  }
+
+  // Handle blocks up to 16x16 with stride up to 32
+  int height_, width_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+uint8_t* SADTest::source_data_ = NULL;
+uint8_t* SADTest::reference_data_ = NULL;
+
+TEST_P(SADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, 255);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The
reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSad(UINT_MAX); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSad(UINT_MAX); + source_stride_ = tmp_stride; +} + +TEST_P(SADTest, MaxSAD) { + // Verify that, when max_sad is set, the implementation does not return a + // value lower than the reference. + FillConstant(source_data_, source_stride_, 255); + FillConstant(reference_data_, reference_stride_, 0); + CheckSad(128); +} + +using std::tr1::make_tuple; + +const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c; +const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c; +const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c; +const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c; +const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c; +INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_c), + make_tuple(8, 16, sad_8x16_c), + make_tuple(16, 8, sad_16x8_c), + make_tuple(8, 8, sad_8x8_c), + make_tuple(4, 4, sad_4x4_c))); + +// ARM tests +#if HAVE_MEDIA +const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6; +INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_armv6))); + +#endif +#if HAVE_NEON +const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon; +const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon; +const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon; +const sad_m_by_n_fn_t sad_8x8_neon = vp8_sad8x8_neon; +const sad_m_by_n_fn_t sad_4x4_neon = vp8_sad4x4_neon; +INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_neon), + make_tuple(8, 16, sad_8x16_neon), + make_tuple(16, 8, sad_16x8_neon), + make_tuple(8, 8, sad_8x8_neon), + make_tuple(4, 4, sad_4x4_neon))); +#endif + +// X86 tests +#if HAVE_MMX +const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx; +const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx; +const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx; +const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx; +const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx; +INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_mmx), + make_tuple(8, 16, sad_8x16_mmx), + make_tuple(16, 8, sad_16x8_mmx), + make_tuple(8, 8, sad_8x8_mmx), + make_tuple(4, 4, sad_4x4_mmx))); +#endif +#if HAVE_SSE2 +const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt; +const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt; +const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt; +const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; +const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; +INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_wmt), + make_tuple(8, 16, sad_8x16_wmt), + make_tuple(16, 8, sad_16x8_wmt), + make_tuple(8, 8, sad_8x8_wmt), + make_tuple(4, 4, sad_4x4_wmt))); +#endif +#if HAVE_SSSE3 +const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; +INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( + make_tuple(16, 16, sad_16x16_sse3))); +#endif + +} // namespace diff --git a/test/set_roi.cc b/test/set_roi.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b6112efbe57c36f6d8f27f520f68e55fd1b3279 --- /dev/null +++ b/test/set_roi.cc @@ -0,0 
+1,182 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+extern "C" {
+#include "vp8/encoder/onyx_int.h"
+}
+
+namespace {
+
+TEST(Vp8RoiMapTest, ParameterCheck) {
+  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
+
+  const int internalq_trans[] = {
+    0, 1, 2, 3, 4, 5, 7, 8,
+    9, 10, 12, 13, 15, 17, 18, 19,
+    20, 21, 23, 24, 25, 26, 27, 28,
+    29, 30, 31, 33, 35, 37, 39, 41,
+    43, 45, 47, 49, 51, 53, 55, 57,
+    59, 61, 64, 67, 70, 73, 76, 79,
+    82, 85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+  };
+
+  // Initialize elements of cpi with valid defaults.
+  VP8_COMP cpi;
+  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.cyclic_refresh_mode_enabled = 0;
+  cpi.mb.e_mbd.segmentation_enabled = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_map = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_data = 0;
+  cpi.common.mb_rows = 240 >> 4;
+  cpi.common.mb_cols = 320 >> 4;
+  const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
+  vpx_memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+
+  // Segment map
+  cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+
+  // Allocate memory for the source ROI map.
+  unsigned char *roi_map =
+      reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+  vpx_memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  vpx_memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  vpx_memset(&roi_map[mbs - (mbs >> 2)], 3, (mbs >> 2));
+
+  // Do a test call with valid parameters.
+  int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q, delta_lf,
+                                  threshold);
+  EXPECT_EQ(0, roi_retval)
+      << "vp8_set_roimap roi failed with default test parameters";
+
+  // Check that the values in the cpi structure get set as expected.
+  if (roi_retval == 0) {
+    // Check that the segment map got set.
+    const int mapcompare = memcmp(roi_map, cpi.segmentation_map, mbs);
+    EXPECT_EQ(0, mapcompare) << "segment map error";
+
+    // Check the q deltas (note the need to translate into
+    // the internal range of 0-127).
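+    // Editor's note (worked example, not part of the original change):
+    // internalq_trans[] maps the external 0-63 delta_q magnitudes onto the
+    // encoder's internal 0-127 quantizer range; e.g. abs(-25) maps to
+    // internalq_trans[25], which is 30.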
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      const int transq = internalq_trans[abs(delta_q[i])];
+      if (abs(cpi.segment_feature_data[MB_LVL_ALT_Q][i]) != transq) {
+        EXPECT_EQ(transq, cpi.segment_feature_data[MB_LVL_ALT_Q][i])
+            << "segment delta_q error";
+        break;
+      }
+    }
+
+    // Check the loop filter deltas
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      if (cpi.segment_feature_data[MB_LVL_ALT_LF][i] != delta_lf[i]) {
+        EXPECT_EQ(delta_lf[i], cpi.segment_feature_data[MB_LVL_ALT_LF][i])
+            << "segment delta_lf error";
+        break;
+      }
+    }
+
+    // Check the breakout thresholds
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      unsigned int breakout =
+          static_cast<unsigned int>(cpi.segment_encode_breakout[i]);
+
+      if (threshold[i] != breakout) {
+        EXPECT_EQ(threshold[i], breakout)
+            << "breakout threshold error";
+        break;
+      }
+    }
+
+    // Segmentation and segmentation update flags should be set.
+    EXPECT_EQ(1, cpi.mb.e_mbd.segmentation_enabled)
+        << "segmentation_enabled error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_map)
+        << "update_mb_segmentation_map error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_data)
+        << "update_mb_segmentation_data error";
+
+
+    // Try a range of delta q and lf parameters (some legal, some not).
+    for (int i = 0; i < 1000; ++i) {
+      int rand_deltas[4];
+      int deltas_valid;
+      rand_deltas[0] = (rand() % 160) - 80;
+      rand_deltas[1] = (rand() % 160) - 80;
+      rand_deltas[2] = (rand() % 160) - 80;
+      rand_deltas[3] = (rand() % 160) - 80;
+
+      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
+                      (abs(rand_deltas[1]) <= 63) &&
+                      (abs(rand_deltas[2]) <= 63) &&
+                      (abs(rand_deltas[3]) <= 63)) ? 0 : -1;
+
+      // Test with random delta q values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, rand_deltas,
+                                  delta_lf, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dq range check error";
+
+      // One delta_q error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+
+      // Test with random loop filter values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q,
+                                  rand_deltas, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dlf range check error";
+
+      // One delta loop filter error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+    }
+
+    // Test that we report an error if cyclic refresh is enabled.
+    cpi.cyclic_refresh_mode_enabled = 1;
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
+    cpi.cyclic_refresh_mode_enabled = 0;
+
+    // Test an invalid number of rows or columns.
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB rows bounds check error";
+
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols - 1, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB cols bounds check error";
+  }
+
+  // Free allocated memory
+  if (cpi.segmentation_map)
+    vpx_free(cpi.segmentation_map);
+  if (roi_map)
+    vpx_free(roi_map);
+}
+
+}  // namespace
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06f14a1c74586414adcce7d62370cb1327d01556
--- /dev/null
+++ b/test/sixtap_predict_test.cc
@@ -0,0 +1,222 @@
+/*
+* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+*
+* Use of this source code is governed by a BSD-style license
+* that can be found in the LICENSE file in the root of the source
+* tree. An additional intellectual property rights grant can be found
+* in the file PATENTS. All contributing project authors may
+* be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+namespace {
+
+typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr,
+                                    int src_pixels_per_line,
+                                    int xoffset,
+                                    int yoffset,
+                                    uint8_t *dst_ptr,
+                                    int dst_pitch);
+
+class SixtapPredictTest : public PARAMS(int, int, sixtap_predict_fn_t) {
+ public:
+  static void SetUpTestCase() {
+    src_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kSrcSize));
+    dst_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+    dst_c_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(src_);
+    src_ = NULL;
+    vpx_free(dst_);
+    dst_ = NULL;
+    vpx_free(dst_c_);
+    dst_c_ = NULL;
+  }
+
+ protected:
+  // Make test arrays big enough for 16x16 functions. Six-tap filters
+  // need 5 extra pixels outside of the macroblock.
+  static const int kSrcStride = 21;
+  static const int kDstStride = 16;
+  static const int kDataAlignment = 16;
+  static const int kSrcSize = kSrcStride * kSrcStride + 1;
+  static const int kDstSize = kDstStride * kDstStride;
+
+  virtual void SetUp() {
+    width_ = GET_PARAM(0);
+    height_ = GET_PARAM(1);
+    sixtap_predict_ = GET_PARAM(2);
+    memset(src_, 0, kSrcSize);
+    memset(dst_, 0, kDstSize);
+    memset(dst_c_, 0, kDstSize);
+  }
+
+  int width_;
+  int height_;
+  sixtap_predict_fn_t sixtap_predict_;
+  // src_ holds the macroblock we filter on, made 1 byte larger so the tests
+  // can exercise unaligned access. The result is stored in dst_ and dst_c_
+  // (the C reference code result).
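+  // Editor's note (not part of the original change): the tests below call
+  // the filters at &src_[kSrcStride * 2 + 2 + 1], i.e. 2 rows down and
+  // 2 columns in to provide the six-tap context, plus 1 byte so the read
+  // starts at an odd, unaligned address.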
+ static uint8_t* src_; + static uint8_t* dst_; + static uint8_t* dst_c_; +}; + +uint8_t* SixtapPredictTest::src_ = NULL; +uint8_t* SixtapPredictTest::dst_ = NULL; +uint8_t* SixtapPredictTest::dst_c_ = NULL; + +TEST_P(SixtapPredictTest, TestWithPresetData) { + // Test input + static const uint8_t test_data[kSrcSize] = { + 216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, 177, + 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, 233, 120, + 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, 171, 32, + 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, 99, 247, 124, + 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, 83, 155, 91, 10, + 166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, 234, 4, 8, 103, 153, + 167, 174, 187, 26, 193, 109, 64, 141, 90, 48, 200, 174, 204, 36, 184, + 114, 237, 43, 238, 242, 207, 86, 245, 182, 247, 6, 161, 251, 14, 8, 148, + 182, 182, 79, 208, 120, 188, 17, 6, 23, 65, 206, 197, 13, 242, 126, 128, + 224, 170, 110, 211, 121, 197, 200, 47, 188, 207, 208, 184, 221, 216, 76, + 148, 143, 156, 100, 8, 89, 117, 14, 112, 183, 221, 54, 197, 208, 180, 69, + 176, 94, 180, 131, 215, 121, 76, 7, 54, 28, 216, 238, 249, 176, 58, 142, + 64, 215, 242, 72, 49, 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, + 235, 224, 57, 195, 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, + 148, 145, 29, 221, 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, + 35, 181, 153, 93, 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, + 209, 76, 106, 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, + 223, 47, 118, 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, + 24, 226, 247, 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, + 93, 209, 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, + 49, 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215, + 135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36, + 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, 35, + 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, 77, 67, 52, + 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, 115, 161, 17, 83, + 198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, 201, 255, 91, 253, 52, + 134, 60, 138, 131, 208, 251, 101, 48, 2, 227, 228, 118, 132, 245, 202, + 75, 91, 44, 160, 231, 47, 41, 50, 147, 220, 74, 92, 219, 165, 89, 16 + }; + + // Expected result + static const uint8_t expected_dst[kDstSize] = { + 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, 49, 38, + 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, 177, 164, 79, + 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, 154, 102, 102, + 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, 186, 36, 231, + 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, 201, 78, 149, 184, + 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, 129, 49, 25, 133, + 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, 78, 6, 55, 65, 240, + 255, 245, 184, 72, 90, 100, 116, 131, 39, 60, 234, 167, 33, 160, 88, 185, + 200, 157, 159, 176, 127, 151, 138, 102, 168, 106, 170, 86, 82, 219, 189, + 76, 33, 115, 197, 106, 96, 198, 136, 97, 141, 237, 151, 98, 137, 191, + 185, 2, 57, 95, 142, 91, 255, 185, 97, 137, 76, 162, 94, 173, 131, 193, + 161, 81, 106, 72, 135, 222, 234, 137, 66, 137, 106, 243, 210, 147, 95, + 15, 137, 110, 85, 66, 16, 96, 167, 147, 150, 173, 203, 140, 118, 196, + 84, 147, 160, 19, 95, 101, 123, 74, 132, 202, 82, 166, 12, 131, 166, + 189, 170, 159, 85, 79, 66, 57, 152, 132, 
203, 194, 0, 1, 56, 146, 180,
+    224, 156, 28, 83, 181, 79, 76, 80, 46, 160, 175, 59, 106, 43, 87, 75,
+    136, 85, 189, 46, 71, 200, 90
+  };
+
+  uint8_t *src = const_cast<uint8_t*>(test_data);
+
+  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                  2, 2, dst_, kDstStride);
+
+  for (int i = 0; i < height_; ++i)
+    for (int j = 0; j < width_; ++j)
+      ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
+          << "i==" << (i * width_ + j);
+}
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int i = 0; i < kSrcSize; ++i)
+    src_[i] = rnd.Rand8();
+
+  // Run tests for all possible offsets.
+  for (int xoffset = 0; xoffset < 8; ++xoffset) {
+    for (int yoffset = 0; yoffset < 8; ++yoffset) {
+      // Call the C reference function.
+      // Move the start point to the next pixel to test whether the function
+      // reads unaligned data correctly.
+      vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                xoffset, yoffset, dst_c_, kDstStride);
+
+      // Run test.
+      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                      xoffset, yoffset, dst_, kDstStride);
+
+      for (int i = 0; i < height_; ++i)
+        for (int j = 0; j < width_; ++j)
+          ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
+              << "i==" << (i * width_ + j);
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+const sixtap_predict_fn_t sixtap_16x16_c = vp8_sixtap_predict16x16_c;
+const sixtap_predict_fn_t sixtap_8x8_c = vp8_sixtap_predict8x8_c;
+const sixtap_predict_fn_t sixtap_8x4_c = vp8_sixtap_predict8x4_c;
+const sixtap_predict_fn_t sixtap_4x4_c = vp8_sixtap_predict4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_c),
+        make_tuple(8, 8, sixtap_8x8_c),
+        make_tuple(8, 4, sixtap_8x4_c),
+        make_tuple(4, 4, sixtap_4x4_c)));
+#if HAVE_MMX
+const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
+const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
+const sixtap_predict_fn_t sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx;
+const sixtap_predict_fn_t sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_mmx),
+        make_tuple(8, 8, sixtap_8x8_mmx),
+        make_tuple(8, 4, sixtap_8x4_mmx),
+        make_tuple(4, 4, sixtap_4x4_mmx)));
+#endif
+#if HAVE_SSE2
+const sixtap_predict_fn_t sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2;
+const sixtap_predict_fn_t sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2;
+const sixtap_predict_fn_t sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_sse2),
+        make_tuple(8, 8, sixtap_8x8_sse2),
+        make_tuple(8, 4, sixtap_8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+const sixtap_predict_fn_t sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3;
+const sixtap_predict_fn_t sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3;
+const sixtap_predict_fn_t sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3;
+const sixtap_predict_fn_t sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3;
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_ssse3),
+        make_tuple(8, 8, sixtap_8x8_ssse3),
+        make_tuple(8, 4, sixtap_8x4_ssse3),
+        make_tuple(4, 4, sixtap_4x4_ssse3)));
+#endif
+}  // namespace
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99363de640b1c8b91c08c35c4d3a441ab5b2db0b
--- /dev/null
+++ b/test/subtract_test.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/encoder/block.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*subtract_b_fn_t)(BLOCK *be, BLOCKD *bd, int pitch);
+
+namespace {
+
+class SubtractBlockTest : public ::testing::TestWithParam<subtract_b_fn_t> {};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  BLOCK be;
+  BLOCKD bd;
+  // in libvpx, this stride is always 16
+  const int kDiffPredStride = 16;
+  const int kSrcStride[] = {32, 16, 8, 4, 0};
+  const int kBlockWidth = 4;
+  const int kBlockHeight = 4;
+
+  // Allocate, aligning to 16 bytes for the MMX/SSE tests.
+  uint8_t *source = reinterpret_cast<uint8_t*>(
+      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
+  be.src_diff = reinterpret_cast<int16_t*>(
+      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
+  bd.predictor = reinterpret_cast<unsigned char*>(
+      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
+
+  for (int i = 0; kSrcStride[i] > 0; ++i) {
+    // start at block0
+    be.src = 0;
+    be.base_src = &source;
+    be.src_stride = kSrcStride[i];
+
+    // set difference
+    int16_t *src_diff = be.src_diff;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        src_diff[c] = 0xa5a5;
+      }
+      src_diff += kDiffPredStride;
+    }
+
+    // set destination
+    uint8_t *base_src = *be.base_src;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        base_src[c] = rnd.Rand8();
+      }
+      base_src += be.src_stride;
+    }
+
+    // set predictor
+    uint8_t *predictor = bd.predictor;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        predictor[c] = rnd.Rand8();
+      }
+      predictor += kDiffPredStride;
+    }
+
+    GetParam()(&be, &bd, kDiffPredStride);
+
+    base_src = *be.base_src;
+    src_diff = be.src_diff;
+    predictor = bd.predictor;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
+                                                             << ", c = " << c;
+      }
+      src_diff += kDiffPredStride;
+      predictor += kDiffPredStride;
+      base_src += be.src_stride;
+    }
+  }
+  vpx_free(be.src_diff);
+  vpx_free(source);
+  vpx_free(bd.predictor);
+}
+
+INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_c));
+
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_mmx));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_sse2));
+#endif
+
+}  // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 8d40242b875a8a7119d7e41f17d28fe6a4607ff6..c1b6a834c31aac379af4a3bbc02815c278852fd9 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1 +1,123 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0 hantro_collage_w352h288.yuv
+5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf
+65bf1bbbced81b97bd030f376d1b7f61a224793f vp80-00-comprehensive-002.ivf +906b4c1e99eb734504c504b3f1ad8052137ce672 vp80-00-comprehensive-003.ivf +ec144b1af53af895db78355785650b96dd3f0ade vp80-00-comprehensive-004.ivf +afc7091785c62f1c121c4554a2830c30704587d9 vp80-00-comprehensive-005.ivf +42ea9d55c818145d06a9b633b8e85c6a6164fd3e vp80-00-comprehensive-006.ivf +e5b3a73ab79fe024c14309d653d6bed92902ee3b vp80-00-comprehensive-007.ivf +f3c50a58875930adfb84525c0ef59d7e4c08540c vp80-00-comprehensive-008.ivf +4b2841fdb83db51ae322096ae468bbb9dc2c8362 vp80-00-comprehensive-009.ivf +efbff736e3a91ab6a98c5bc2dce65d645944c7b1 vp80-00-comprehensive-010.ivf +6b315102cae008d22a3d2c231be92cb704a222f8 vp80-00-comprehensive-011.ivf +f3214a4fea14c2d5ec689936c1613f274c859ee8 vp80-00-comprehensive-012.ivf +e4094e96d308c8a35b74c480a43d853c5294cd34 vp80-00-comprehensive-013.ivf +5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a vp80-00-comprehensive-014.ivf +e8467688ddf26b5000664f904faf0d70506aa653 vp80-00-comprehensive-015.ivf +aab55582337dfd2a39ff54fb2576a91910d49337 vp80-00-comprehensive-016.ivf +1ba24724f80203c9bae4f1d0f99d534721980016 vp80-00-comprehensive-017.ivf +143a15512b46f436280ddb4d0e6411eb4af434f2 vp80-00-comprehensive-018.ivf +c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b vp80-01-intra-1400.ivf +f383955229afe3408453e316d11553d923ca60d5 vp80-01-intra-1411.ivf +84e1f4343f174c9f3c83f834bac3196fb325bf2c vp80-01-intra-1416.ivf +fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51 vp80-01-intra-1417.ivf +71ea772d3e9d315b8cbecf41207b8a237c34853b vp80-02-inter-1402.ivf +d85dbc4271525dcd128c503f936fe69091d1f8d0 vp80-02-inter-1412.ivf +d4e5d3ad56511867d025f93724d090f92ba6ec3d vp80-02-inter-1418.ivf +91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa vp80-02-inter-1424.ivf +17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02 vp80-03-segmentation-01.ivf +3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb vp80-03-segmentation-02.ivf +c156778d5340967d4b369c490848076e92f1f875 vp80-03-segmentation-03.ivf +d25dcff6c60e87a1af70945b8911b6b4998533b0 vp80-03-segmentation-04.ivf +362baba2ce454c9db21218f35e81c27a5ed0b730 vp80-03-segmentation-1401.ivf +d223ae7ee748ce07e74c4679bfd219e84aa9f4b0 vp80-03-segmentation-1403.ivf +033adf7f3a13836a3f1cffcb87c1972900f2b5c6 vp80-03-segmentation-1407.ivf +4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f vp80-03-segmentation-1408.ivf +f37a62b197c2600d75e0ccfbb31b60efdedac251 vp80-03-segmentation-1409.ivf +eb25bd7bfba5b2f6935018a930f42d123b1e7fcd vp80-03-segmentation-1410.ivf +b9d5c436663a30c27cfff84b53a002e501258843 vp80-03-segmentation-1413.ivf +6da92b9d1a180cc3a8afe348ab12258f5a37be1a vp80-03-segmentation-1414.ivf +a4f5842602886bd669f115f93d8a35c035cb0948 vp80-03-segmentation-1415.ivf +f295dceb8ef278b77251b3f9df8aee22e161d547 vp80-03-segmentation-1425.ivf +198dbf9f36f733200e432664cc8c5752d59779de vp80-03-segmentation-1426.ivf +7704804e32f5de976803929934a7fafe101ac7b0 vp80-03-segmentation-1427.ivf +831ccd862ea95ca025d2f3bd8b88678752f5416d vp80-03-segmentation-1432.ivf +b3c11978529289f9109f2766fcaba3ebc40e11ef vp80-03-segmentation-1435.ivf +a835a731f5520ebfc1002c40121264d0020559ac vp80-03-segmentation-1436.ivf +1d1732942f773bb2a5775fcb9689b1579ce28eab vp80-03-segmentation-1437.ivf +db04799adfe089dfdf74dbd43cc05ede7161f99e vp80-03-segmentation-1441.ivf +7caf39b3f20cfd52b998210878062e52a5edf1e6 vp80-03-segmentation-1442.ivf +3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261 vp80-04-partitions-1404.ivf +93cc323b6b6867f1b12dd48773424549c6960a6b vp80-04-partitions-1405.ivf +047eedb14b865bdac8a3538e63801054e0295e9c vp80-04-partitions-1406.ivf 
+0f1233bd2bc33f56ce5e495dbd455d122339f384 vp80-05-sharpness-1428.ivf +51767fc136488a9535c2a4c38067c542ee2048df vp80-05-sharpness-1429.ivf +9805aa107672de25d6fb8c35e20d06deca5efe18 vp80-05-sharpness-1430.ivf +61db6b965f9c27aebe71b85bf2d5877e58e4bbdf vp80-05-sharpness-1431.ivf +10420d266290d2923555f84af38eeb96edbd3ae8 vp80-05-sharpness-1433.ivf +3ed24f9a80cddfdf75824ba95cdb4ff9286cb443 vp80-05-sharpness-1434.ivf +c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163 vp80-05-sharpness-1438.ivf +aff51d865c2621b60510459244ea83e958e4baed vp80-05-sharpness-1439.ivf +da386e72b19b5485a6af199c5eb60ef25e510dd1 vp80-05-sharpness-1440.ivf +6759a095203d96ccd267ce09b1b050b8cc4c2f1f vp80-05-sharpness-1443.ivf +db55ec7fd02c864ba996ff060b25b1e08611330b vp80-00-comprehensive-001.ivf.md5 +29db0ad011cba1e45f856d5623cd38dac3e3bf19 vp80-00-comprehensive-002.ivf.md5 +e84f258f69e173e7d68f8f8c037a0a3766902182 vp80-00-comprehensive-003.ivf.md5 +eb7912eaf69559a16fd82bc3f5fb1524cf4a4466 vp80-00-comprehensive-004.ivf.md5 +4206f71c94894bd5b5b376f6c09b3817dbc65206 vp80-00-comprehensive-005.ivf.md5 +4f89b356f6f2fecb928f330a10f804f00f5325f5 vp80-00-comprehensive-006.ivf.md5 +2813236a32964dd8007e17648bcf035a20fcda6c vp80-00-comprehensive-007.ivf.md5 +10746c72098f872803c900e17c5680e451f5f498 vp80-00-comprehensive-008.ivf.md5 +39a23d0692ce64421a7bb7cdf6ccec5928d37fff vp80-00-comprehensive-009.ivf.md5 +f6e3de8931a0cc659bda8fbc14050346955e72d4 vp80-00-comprehensive-010.ivf.md5 +101683ec195b6e944f7cd1e468fc8921439363e6 vp80-00-comprehensive-011.ivf.md5 +1f592751ce46d8688998fa0fa4fbdcda0fd4058c vp80-00-comprehensive-012.ivf.md5 +6066176f90ca790251e795fca1a5797d59999841 vp80-00-comprehensive-013.ivf.md5 +2656da94ba93691f23edc4d60b3a09e2be46c217 vp80-00-comprehensive-014.ivf.md5 +c6e0d5f5d61460c8ac8edfa4e701f10312c03133 vp80-00-comprehensive-015.ivf.md5 +ee60fee501d8493e34e8d6a1fe315b51ed09b24a vp80-00-comprehensive-016.ivf.md5 +9f1914ceffcad4546c0a29de3ef591d8bea304dc vp80-00-comprehensive-017.ivf.md5 +e0305178fe288a9fd8082b39e2d03181edb19054 vp80-00-comprehensive-018.ivf.md5 +612494da2fa799cc9d76dcdd835ae6c7cb2e5c05 vp80-01-intra-1400.ivf.md5 +48ea06097ac8269c5e8c2131d3d0639f431fcf0e vp80-01-intra-1411.ivf.md5 +6e2ab4e7677ad0ba868083ca6bc387ee922b400c vp80-01-intra-1416.ivf.md5 +eca0a90348959ce3854142f8d8641b13050e8349 vp80-01-intra-1417.ivf.md5 +920feea203145d5c2258a91c4e6991934a79a99e vp80-02-inter-1402.ivf.md5 +f71d97909fe2b3dd65be7e1f56c72237f0cef200 vp80-02-inter-1412.ivf.md5 +e911254569a30bbb2a237ff8b79f69ed9da0672d vp80-02-inter-1418.ivf.md5 +58c789c50c9bb9cc90580bed291164a0939d28ba vp80-02-inter-1424.ivf.md5 +ff3e2f441327b9c20a0b37c524e0f5a48a36de7b vp80-03-segmentation-01.ivf.md5 +0791f417f076a542ae66fbc3426ab4d94cbd6c75 vp80-03-segmentation-02.ivf.md5 +722e50f1a6a91c34302d68681faffc1c26d1cc57 vp80-03-segmentation-03.ivf.md5 +c701f1885bcfb27fb8e70cc65606b289172ef889 vp80-03-segmentation-04.ivf.md5 +f79bc9ec189a2b4807632a3d0c5bf04a178b5300 vp80-03-segmentation-1401.ivf.md5 +b9aa4c74c0219b639811c44760d0b24cd8bb436a vp80-03-segmentation-1403.ivf.md5 +70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334 vp80-03-segmentation-1407.ivf.md5 +265f962ee781531f9a93b9309461316fd32b2a1d vp80-03-segmentation-1408.ivf.md5 +0c4ecbbd6dc042d30e626d951b65f460dd6cd563 vp80-03-segmentation-1409.ivf.md5 +cf779af36a937f06570a0fca9db64ba133451dee vp80-03-segmentation-1410.ivf.md5 +0e6c5036d51ab078842f133934926c598a9cff02 vp80-03-segmentation-1413.ivf.md5 +eb3930aaf229116c80d507516c34759c3f6cdf69 vp80-03-segmentation-1414.ivf.md5 
+123d6c0f72ee87911c4ae7538e87b7d163b22d6c vp80-03-segmentation-1415.ivf.md5
+e70551d1a38920e097a5d8782390b79ecaeb7505 vp80-03-segmentation-1425.ivf.md5
+44e8f4117e46dbb302b2cfd81171cc1a1846e431 vp80-03-segmentation-1426.ivf.md5
+52636e54aee5f95bbace37021bd67de5db767e9a vp80-03-segmentation-1427.ivf.md5
+b1ad3eff20215c28e295b15ef3636ed926d59cba vp80-03-segmentation-1432.ivf.md5
+24c22a552fa28a90e5978f67f57181cc2d7546d7 vp80-03-segmentation-1435.ivf.md5
+96c49c390abfced18a7a8c9b9ea10af778e10edb vp80-03-segmentation-1436.ivf.md5
+f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5
+1c0700ca27c9b0090a7747a4b0b4dc21d1843181 vp80-03-segmentation-1441.ivf.md5
+81d4f23ca32667ee958bae579c8f5e97ba72eb97 vp80-03-segmentation-1442.ivf.md5
+272efcef07a3a30fbca51bfd566063d8258ec0be vp80-04-partitions-1404.ivf.md5
+66ed219ab812ac801b256d35cf495d193d4cf478 vp80-04-partitions-1405.ivf.md5
+36083f37f56f502bd60ec5e07502ee9e6b8699b0 vp80-04-partitions-1406.ivf.md5
+6ca909bf168a64c09415626294665dc1be3d1973 vp80-05-sharpness-1428.ivf.md5
+1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a vp80-05-sharpness-1429.ivf.md5
+71bcbe5357d36a19df5b07fbe3e27bffa8893f0a vp80-05-sharpness-1430.ivf.md5
+89a09b1dffce2d55770a89e58d9925c70ef79bf8 vp80-05-sharpness-1431.ivf.md5
+08444a18b4e6ba3450c0796dd728d48c399a2dc9 vp80-05-sharpness-1433.ivf.md5
+6d6223719a90c13e848aa2a8a6642098cdb5977a vp80-05-sharpness-1434.ivf.md5
+41d70bb5fa45bc88da1604a0af466930b8dd77b5 vp80-05-sharpness-1438.ivf.md5
+086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5
+d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5
\ No newline at end of file
diff --git a/test/test.mk b/test/test.mk
index 129c188626078d1f7415fa15806287df10db5642..3c6d44c972f04141f5fe1d0bceddccbf15add106 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,10 +1,186 @@
-LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += acm_random.h
-LIBVPX_TEST_SRCS-yes += boolcoder_test.cc
-LIBVPX_TEST_SRCS-yes += dct16x16_test.cc
-LIBVPX_TEST_SRCS-yes += fdct4x4_test.cc
-LIBVPX_TEST_SRCS-yes += fdct8x8_test.cc
-LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
+LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
+LIBVPX_TEST_SRCS-yes += util.h
+LIBVPX_TEST_SRCS-yes += video_source.h
+
+##
+## BLACK BOX TESTS
+##
+## Black box tests only use the public API.
+##
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += error_resilience_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc
+##
+## WHITE BOX TESTS
+##
+## Whitebox tests invoke functions not exposed via the public API. Certain
+## shared library builds don't make these functions accessible.
+##
+ifeq ($(CONFIG_SHARED),)
+
+# These tests require both the encoder and decoder to be built.
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
+LIBVPX_TEST_SRCS-yes += boolcoder_test.cc
+endif
+
+LIBVPX_TEST_SRCS-yes += idctllm_test.cc
+LIBVPX_TEST_SRCS-yes += intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
+LIBVPX_TEST_SRCS-yes += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
+LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
+
+# VP9 tests
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
+endif
+
+endif
+
-LIBVPX_TEST_DATA-yes += hantro_collage_w352h288.yuv
+##
+## TEST DATA
+##
+LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5 diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 924aa2e2e25b312bbccd197f1ef6dccb8158247b..2b9b0c21f7591742bf7888d1415f2c66024bd9c4 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -26,7 +26,7 @@ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); #if ARCH_X86 || ARCH_X86_64 - int simd_caps = x86_simd_caps(); + const int simd_caps = x86_simd_caps(); if (!(simd_caps & HAS_MMX)) append_gtest_filter(":-MMX/*"); if (!(simd_caps & HAS_SSE)) diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc new file mode 100644 index 
0000000000000000000000000000000000000000..938457b2bc5293bca5b0a7513317609c347c4715
--- /dev/null
+++ b/test/test_vector_test.cc
@@ -0,0 +1,144 @@
+/*
+ Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+
+ Use of this source code is governed by a BSD-style license
+ that can be found in the LICENSE file in the root of the source
+ tree. An additional intellectual property rights grant can be found
+ in the file PATENTS. All contributing project authors may
+ be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+extern "C" {
+#include "./md5_utils.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+#if defined(_MSC_VER)
+#define snprintf sprintf_s
+#endif
+
+namespace {
+// There are 61 test vectors in total.
+const char *kTestVectors[] = {
+  "vp80-00-comprehensive-001.ivf",
+  "vp80-00-comprehensive-002.ivf", "vp80-00-comprehensive-003.ivf",
+  "vp80-00-comprehensive-004.ivf", "vp80-00-comprehensive-005.ivf",
+  "vp80-00-comprehensive-006.ivf", "vp80-00-comprehensive-007.ivf",
+  "vp80-00-comprehensive-008.ivf", "vp80-00-comprehensive-009.ivf",
+  "vp80-00-comprehensive-010.ivf", "vp80-00-comprehensive-011.ivf",
+  "vp80-00-comprehensive-012.ivf", "vp80-00-comprehensive-013.ivf",
+  "vp80-00-comprehensive-014.ivf", "vp80-00-comprehensive-015.ivf",
+  "vp80-00-comprehensive-016.ivf", "vp80-00-comprehensive-017.ivf",
+  "vp80-00-comprehensive-018.ivf", "vp80-01-intra-1400.ivf",
+  "vp80-01-intra-1411.ivf", "vp80-01-intra-1416.ivf",
+  "vp80-01-intra-1417.ivf", "vp80-02-inter-1402.ivf",
+  "vp80-02-inter-1412.ivf", "vp80-02-inter-1418.ivf",
+  "vp80-02-inter-1424.ivf", "vp80-03-segmentation-01.ivf",
+  "vp80-03-segmentation-02.ivf", "vp80-03-segmentation-03.ivf",
+  "vp80-03-segmentation-04.ivf", "vp80-03-segmentation-1401.ivf",
+  "vp80-03-segmentation-1403.ivf", "vp80-03-segmentation-1407.ivf",
+  "vp80-03-segmentation-1408.ivf", "vp80-03-segmentation-1409.ivf",
+  "vp80-03-segmentation-1410.ivf", "vp80-03-segmentation-1413.ivf",
+  "vp80-03-segmentation-1414.ivf", "vp80-03-segmentation-1415.ivf",
+  "vp80-03-segmentation-1425.ivf", "vp80-03-segmentation-1426.ivf",
+  "vp80-03-segmentation-1427.ivf", "vp80-03-segmentation-1432.ivf",
+  "vp80-03-segmentation-1435.ivf", "vp80-03-segmentation-1436.ivf",
+  "vp80-03-segmentation-1437.ivf", "vp80-03-segmentation-1441.ivf",
+  "vp80-03-segmentation-1442.ivf", "vp80-04-partitions-1404.ivf",
+  "vp80-04-partitions-1405.ivf", "vp80-04-partitions-1406.ivf",
+  "vp80-05-sharpness-1428.ivf", "vp80-05-sharpness-1429.ivf",
+  "vp80-05-sharpness-1430.ivf", "vp80-05-sharpness-1431.ivf",
+  "vp80-05-sharpness-1433.ivf", "vp80-05-sharpness-1434.ivf",
+  "vp80-05-sharpness-1438.ivf", "vp80-05-sharpness-1439.ivf",
+  "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf"
+};
+
+class TestVectorTest : public libvpx_test::DecoderTest,
+                       public ::testing::TestWithParam<const char*> {
+ protected:
+  TestVectorTest() : md5_file_(NULL) {}
+
+  virtual ~TestVectorTest() {
+    if (md5_file_)
+      fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string& md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t& img,
+                                     const unsigned int frame_number) {
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    MD5Context md5;
+    MD5Init(&md5);
+
+    // Compute and update md5 for each row in the decompressed data.
+    for (int plane = 0; plane < 3; ++plane) {
+      uint8_t *buf = img.planes[plane];
+
+      for (unsigned int y = 0; y < (plane ? (img.d_h + 1) >> 1 : img.d_h);
+           ++y) {
+        MD5Update(&md5, buf, (plane ? (img.d_w + 1) >> 1 : img.d_w));
+        buf += img.stride[plane];
+      }
+    }
+
+    uint8_t md5_sum[16];
+    MD5Final(md5_sum, &md5);
+
+    char actual_md5[33];
+    // Convert the digest to a hex string.
+    for (int i = 0; i < 16; i++) {
+      snprintf(&actual_md5[i * 2], sizeof(actual_md5) - i * 2, "%02x",
+               md5_sum[i]);
+    }
+    actual_md5[32] = '\0';
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+ private:
+  FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors and decodes them.
+// The md5 checksum is computed for each frame in the video file. If the
+// checksums match the correct md5 data, the test passes; otherwise it
+// fails.
+TEST_P(TestVectorTest, MD5Match) {
+  const std::string filename = GetParam();
+  // Open compressed video file.
+  libvpx_test::IVFVideoSource video(filename);
+
+  video.Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Decode the frames and check the md5 of each.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+INSTANTIATE_TEST_CASE_P(TestVectorSequence, TestVectorTest,
+                        ::testing::ValuesIn(kTestVectors));
+
+}  // namespace
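The frame hook above computes the MD5 plane by plane and row by row rather than over the whole buffer: decoded frames carry a stride wider than the visible width, and the reference checksums cover only the visible pixels. A minimal sketch of the same walk, reusing the MD5Init/MD5Update/MD5Final helpers from md5_utils.h that the test already links in (hash_frame is an illustrative name, not part of the patch):

    #include "./md5_utils.h"
    #include "vpx/vpx_image.h"
    #include "vpx/vpx_integer.h"

    // Hash only the visible pixels of a decoded I420 frame. The chroma
    // planes are half size, rounded up so odd dimensions are covered.
    void hash_frame(const vpx_image_t& img, uint8_t digest[16]) {
      MD5Context md5;
      MD5Init(&md5);
      for (int plane = 0; plane < 3; ++plane) {
        const uint8_t *buf = img.planes[plane];
        const unsigned int h = plane ? (img.d_h + 1) >> 1 : img.d_h;
        const unsigned int w = plane ? (img.d_w + 1) >> 1 : img.d_w;
        for (unsigned int y = 0; y < h; ++y) {
          MD5Update(&md5, buf, w);   // w bytes only, never the full stride
          buf += img.stride[plane];  // stride skips the border padding
        }
      }
      MD5Final(digest, &md5);
    }

Hashing d_w bytes per row keeps the digest stable across builds whose allocators pad frame borders differently.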
diff --git a/test/util.h b/test/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..06a70cc8e412d32dd9e98cce39606a6a912d1c82
--- /dev/null
+++ b/test/util.h
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_UTIL_H_
+#define TEST_UTIL_H_
+
+// Macros
+#define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+#endif  // TEST_UTIL_H_
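util.h's PARAMS and GET_PARAM macros wrap gtest's value-parameterized fixtures so tests that take several parameters stay readable: PARAMS(A, B) declares the TestWithParam base over a std::tr1::tuple, and GET_PARAM(k) pulls out one element. A hypothetical use; SpeedTest and its parameter values are illustrative only, not part of the patch:

    #include "third_party/googletest/src/include/gtest/gtest.h"
    #include "test/util.h"

    // A fixture parameterized over (speed, threads) pairs.
    class SpeedTest : public PARAMS(int, int) {
     protected:
      int speed() const { return GET_PARAM(0); }
      int threads() const { return GET_PARAM(1); }
    };

    TEST_P(SpeedTest, ParamsAreForwarded) {
      EXPECT_GE(speed(), 0);
      EXPECT_GE(threads(), 1);
    }

    // Runs the test over the cross product {0,1,2} x {1,2,4}.
    INSTANTIATE_TEST_CASE_P(Sweep, SpeedTest,
                            ::testing::Combine(::testing::Range(0, 3),
                                               ::testing::Values(1, 2, 4)));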
diff --git a/test/video_source.h b/test/video_source.h
new file mode 100644
index 0000000000000000000000000000000000000000..9772657d6861e8dbb437c4d499c97fafcef7cc33
--- /dev/null
+++ b/test/video_source.h
@@ -0,0 +1,175 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_VIDEO_SOURCE_H_
+#define TEST_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "test/acm_random.h"
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx_test {
+
+static FILE *OpenTestDataFile(const std::string& file_name) {
+  std::string path_to_source = file_name;
+  const char *kDataPath = getenv("LIBVPX_TEST_DATA_PATH");
+
+  if (kDataPath) {
+    path_to_source = kDataPath;
+    path_to_source += "/";
+    path_to_source += file_name;
+  }
+
+  return fopen(path_to_source.c_str(), "rb");
+}
+
+// Abstract base class for test video sources, which provide a stream of
+// vpx_image_t images with associated timestamps and duration.
+class VideoSource {
+ public:
+  virtual ~VideoSource() {}
+
+  // Prepare the stream for reading, rewind/open as necessary.
+  virtual void Begin() = 0;
+
+  // Advance the cursor to the next frame.
+  virtual void Next() = 0;
+
+  // Get the current video frame, or NULL on End-Of-Stream.
+  virtual vpx_image_t *img() const = 0;
+
+  // Get the presentation timestamp of the current frame.
+  virtual vpx_codec_pts_t pts() const = 0;
+
+  // Get the current frame's duration.
+  virtual unsigned long duration() const = 0;
+
+  // Get the timebase for the stream.
+  virtual vpx_rational_t timebase() const = 0;
+
+  // Get the current frame counter, starting at 0.
+  virtual unsigned int frame() const = 0;
+
+  // Get the current file limit.
+  virtual unsigned int limit() const = 0;
+};
+
+
+class DummyVideoSource : public VideoSource {
+ public:
+  DummyVideoSource() : img_(NULL), limit_(100), width_(0), height_(0) {
+    SetSize(80, 64);
+  }
+
+  virtual ~DummyVideoSource() { vpx_img_free(img_); }
+
+  virtual void Begin() {
+    frame_ = 0;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_ : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = {1, 30};
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      vpx_img_free(img_);
+      raw_sz_ = ((width + 31) & ~31) * height * 3 / 2;
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 32);
+      width_ = width;
+      height_ = height;
+    }
+  }
+
+ protected:
+  virtual void FillFrame() { memset(img_->img_data, 0, raw_sz_); }
+
+  vpx_image_t *img_;
+  size_t raw_sz_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+};
+
+
+class RandomVideoSource : public DummyVideoSource {
+ public:
+  RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+      : rnd_(seed),
+        seed_(seed) { }
+
+ protected:
+  // Reset the RNG to get a matching stream for the second pass.
+  virtual void Begin() {
+    frame_ = 0;
+    rnd_.Reset(seed_);
+    FillFrame();
+  }
+
+  // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
+  // than holding previous frames to encourage keyframes to be thrown.
+  virtual void FillFrame() {
+    if (frame_ % 30 < 15)
+      for (size_t i = 0; i < raw_sz_; ++i)
+        img_->img_data[i] = rnd_.Rand8();
+    else
+      memset(img_->img_data, 0, raw_sz_);
+  }
+
+  ACMRandom rnd_;
+  int seed_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// compressed data to the decoder.
+class CompressedVideoSource { + public: + virtual ~CompressedVideoSource() {} + + virtual void Init() = 0; + + // Prepare the stream for reading, rewind/open as necessary. + virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + virtual const uint8_t *cxdata() const = 0; + + virtual const unsigned int frame_size() const = 0; + + virtual const unsigned int frame_number() const = 0; +}; + +} // namespace libvpx_test + +#endif // TEST_VIDEO_SOURCE_H_ diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..619b23d2275fee8e1eee53b024c1755b94444908 --- /dev/null +++ b/test/vp8_fdct4x4_test.cc @@ -0,0 +1,169 @@ +/* +* Copyright (c) 2012 The WebM project authors. All Rights Reserved. +* +* Use of this source code is governed by a BSD-style license +* that can be found in the LICENSE file in the root of the source +* tree. An additional intellectual property rights grant can be found +* in the file PATENTS. All contributing project authors may +* be found in the AUTHORS file in the root of the source tree. +*/ + + +#include +#include +#include +#include +#include +#include + + +extern "C" { +#include "vpx_rtcd.h" +} + +#include "test/acm_random.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_integer.h" + + +namespace { + +const int cospi8sqrt2minus1 = 20091; +const int sinpi8sqrt2 = 35468; + +void reference_idct4x4(const int16_t *input, int16_t *output) { + const int16_t *ip = input; + int16_t *op = output; + + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[8]; + const int b1 = ip[0] - ip[8]; + const int temp1 = (ip[4] * sinpi8sqrt2) >> 16; + const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[12] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = a1 + d1; + op[12] = a1 - d1; + op[4] = b1 + c1; + op[8] = b1 - c1; + ++ip; + ++op; + } + ip = output; + op = output; + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[2]; + const int b1 = ip[0] - ip[2]; + const int temp1 = (ip[1] * sinpi8sqrt2) >> 16; + const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[3] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + ip += 4; + op += 4; + } +} + +using libvpx_test::ACMRandom; + +TEST(Vp8FdctTest, SignBiasCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int16_t test_input_block[16]; + int16_t test_output_block[16]; + const int pitch = 8; + int count_sign_block[16][2]; + const int count_test_block = 1000000; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. 
+ for (int j = 0; j < 16; ++j) + test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + + vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) + ++count_sign_block[j][0]; + else if (test_output_block[j] > 0) + ++count_sign_block[j][1]; + } + } + + bool bias_acceptable = true; + for (int j = 0; j < 16; ++j) + bias_acceptable = bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]"; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-15, 15]. + for (int j = 0; j < 16; ++j) + test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); + + vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) + ++count_sign_block[j][0]; + else if (test_output_block[j] > 0) + ++count_sign_block[j][1]; + } + } + + bias_acceptable = true; + for (int j = 0; j < 16; ++j) + bias_acceptable = bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; +}; + +TEST(Vp8FdctTest, RoundTripErrorCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int max_error = 0; + double total_error = 0; + const int count_test_block = 1000000; + for (int i = 0; i < count_test_block; ++i) { + int16_t test_input_block[16]; + int16_t test_temp_block[16]; + int16_t test_output_block[16]; + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 16; ++j) + test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + + const int pitch = 8; + vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + reference_idct4x4(test_temp_block, test_output_block); + + for (int j = 0; j < 16; ++j) { + const int diff = test_input_block[j] - test_output_block[j]; + const int error = diff * diff; + if (max_error < error) + max_error = error; + total_error += error; + } + } + + EXPECT_GE(1, max_error ) + << "Error: FDCT/IDCT has an individual roundtrip error > 1"; + + EXPECT_GE(count_test_block, total_error) + << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; +}; + +} // namespace diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c index 930a7ae09a43f7e9aa228f4a6176e97f0109b410..c142a17bb48d7f4683d6abc99575c99bee9efb67 100644 --- a/third_party/libyuv/source/scale.c +++ b/third_party/libyuv/source/scale.c @@ -60,7 +60,7 @@ void SetUseReferenceImpl(int use) { #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, +void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride, uint8* dst, int dst_width) { asm volatile ( "1: \n" @@ -102,7 +102,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, } #define HAS_SCALEROWDOWN4_NEON -static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, +static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "1: \n" @@ -160,7 +160,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. 
// Point samples 32 pixels to 24 pixels. -static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, +static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "1: \n" @@ -284,7 +284,7 @@ const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; // 32 -> 12 -static void ScaleRowDown38_NEON(const uint8* src_ptr, int, +static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "vld1.u8 {q3}, [%3] \n" diff --git a/tools/all_builds.py b/tools/all_builds.py index d1f0c80c038af476cf03af5c0483a7434067b1af..78581d9f0d23824ce1982ca0fb2347b3e79ec2ef 100755 --- a/tools/all_builds.py +++ b/tools/all_builds.py @@ -5,7 +5,7 @@ import subprocess import sys LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" +BASE_COMMAND = "./configure --disable-vp8 --disable-unit-tests --enable-internal-stats --enable-experimental" def RunCommand(command): run = subprocess.Popen(command, shell=True) diff --git a/usage.dox b/usage.dox index 0db080b008450b06d05ffbbc3f435baf37b258e0..92fd6b26e2dcc7537d25894baae3d8db532cfd60 100644 --- a/usage.dox +++ b/usage.dox @@ -1,6 +1,6 @@ /*!\page usage Usage - The vpx Multi-Format codec SDK provides a unified interface amongst its + The vpx multi-format codec SDK provides a unified interface amongst its supported codecs. This abstraction allows applications using this SDK to easily support multiple video formats with minimal code duplication or "special casing." This section describes the interface common to all codecs. @@ -14,8 +14,12 @@ Fore more information on decoder and encoder specific usage, see the following pages: - \if decoder - \subpage usage_decode \endif - \if decoder - \subpage usage_encode \endif + \if decoder + - \subpage usage_decode + \endif + \if decoder + - \subpage usage_encode + \endif \section usage_types Important Data Types There are two important data structures to consider in this interface. @@ -82,6 +86,7 @@ The available initialization methods are: \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif + \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/usage_cx.dox b/usage_cx.dox index 980a03461293b9383646f516748a60090ddd731c..62f3e450b0e9681ec581cf9909151e88f2a5e272 100644 --- a/usage_cx.dox +++ b/usage_cx.dox @@ -1,6 +1,6 @@ /*! \page usage_encode Encode - The vpx_codec_encode() function is at the core of the decode loop. It + The vpx_codec_encode() function is at the core of the encode loop. It processes raw images passed by the application, producing packets of compressed data. The deadline parameter controls the amount of time in microseconds the encoder should spend working on the frame. For @@ -10,5 +10,4 @@ \ref samples - */ diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c new file mode 100644 index 0000000000000000000000000000000000000000..8af9e904912c3105dba67090426ab99c617a5700 --- /dev/null +++ b/vp8/common/alloccommon.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "onyxc_int.h" +#include "findnearmv.h" +#include "entropymode.h" +#include "systemdependent.h" + +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) +{ + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); + + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); +#if CONFIG_POSTPROC + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + if (oci->post_proc_buffer_int_used) + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int); + + vpx_free(oci->pp_limits_buffer); + oci->pp_limits_buffer = NULL; +#endif + + vpx_free(oci->above_context); + vpx_free(oci->mip); +#if CONFIG_ERROR_CONCEALMENT + vpx_free(oci->prev_mip); + oci->prev_mip = NULL; +#endif + + oci->above_context = NULL; + oci->mip = NULL; +} + +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) +{ + int i; + + vp8_de_alloc_frame_buffers(oci); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + for (i = 0; i < NUM_YV12_BUFFERS; i++) + { + oci->fb_idx_ref_cnt[i] = 0; + oci->yv12_fb[i].flags = 0; + if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + } + + oci->new_fb_idx = 0; + oci->lst_fb_idx = 1; + oci->gld_fb_idx = 2; + oci->alt_fb_idx = 3; + + oci->fb_idx_ref_cnt[0] = 1; + oci->fb_idx_ref_cnt[1] = 1; + oci->fb_idx_ref_cnt[2] = 1; + oci->fb_idx_ref_cnt[3] = 1; + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) + goto allocation_fail; + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + /* Allocation of previous mode info will be done in vp8_decode_frame() + * as it is a decoder only data */ + + oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + + if (!oci->above_context) + goto allocation_fail; + +#if CONFIG_POSTPROC + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->post_proc_buffer_int_used = 0; + vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state)); + vpx_memset(oci->post_proc_buffer.buffer_alloc, 128, + oci->post_proc_buffer.frame_size); + + /* Allocate buffer to store post-processing filter coefficients. 
+ * + * Note: Round up mb_cols to support SIMD reads + */ + oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1)); + if (!oci->pp_limits_buffer) + goto allocation_fail; +#endif + + return 0; + +allocation_fail: + vp8_de_alloc_frame_buffers(oci); + return 1; +} + +void vp8_setup_version(VP8_COMMON *cm) +{ + switch (cm->version) + { + case 0: + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + cm->no_lpf = 1; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 3: + cm->no_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 1; + break; + default: + /*4,5,6,7 are reserved for future use*/ + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + } +} +void vp8_create_common(VP8_COMMON *oci) +{ + vp8_machine_specific_config(oci); + + vp8_init_mbmode_probs(oci); + vp8_default_bmode_probs(oci->fc.bmode_prob); + + oci->mb_no_coeff_skip = 1; + oci->no_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->multi_token_partition = ONE_PARTITION; + oci->clr_type = REG_YUV; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + /* Initialize reference frame sign bias structure to defaults */ + vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + /* Default disable buffer to buffer copying */ + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; +} + +void vp8_remove_common(VP8_COMMON *oci) +{ + vp8_de_alloc_frame_buffers(oci); +} diff --git a/vp8/common/alloccommon.h b/vp8/common/alloccommon.h new file mode 100644 index 0000000000000000000000000000000000000000..ea93c252280b305bcdee0799181c7d6947541c57 --- /dev/null +++ b/vp8/common/alloccommon.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ALLOCCOMMON_H +#define __INC_ALLOCCOMMON_H + +#include "onyxc_int.h" + +void vp8_create_common(VP8_COMMON *oci); +void vp8_remove_common(VP8_COMMON *oci); +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); +void vp8_setup_version(VP8_COMMON *oci); + +#endif diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..9704b42105b366b40cdbbe19b23b073f3ac8942f --- /dev/null +++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
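The bilinear filter added below interpolates sub-pixel positions with a 2-tap kernel in two passes, and the first pass writes its 16-bit intermediate transposed so that both the horizontal and the vertical pass stream through memory sequentially. A scalar model of the first pass under that scheme (names are ours; the real routine additionally special-cases a leading coefficient of 128 as a plain copy and pads the transposed stride for alignment):

    // Horizontal 2-tap pass: the coefficients sum to 128, so the result
    // is rounded (+64) and shifted right by 7. Output goes column-major.
    void bilinear_first_pass_model(const unsigned char *src,
                                   unsigned short *dst,
                                   unsigned int src_pitch,
                                   unsigned int height, unsigned int width,
                                   const short *filter) {
      for (unsigned int r = 0; r < height; ++r) {
        for (unsigned int c = 0; c < width; ++c) {
          const int sum = src[c] * filter[0] + src[c + 1] * filter[1];
          dst[c * height + r] = (unsigned short)((sum + 64) >> 7);
        }
        src += src_pitch;
      }
    }

The second pass then runs the same kernel down what are now contiguous rows and transposes the result back on store.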
+; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop 
counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..abf048c2fa88b4b4bee0c9e6288769b9072ccd07 --- /dev/null +++ b/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -0,0 +1,186 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
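vp8_copy_mem16x16_v6, which follows, probes the source pointer's alignment once and commits to the widest access that alignment allows: multi-register LDM/STM transfers for 8- and 16-byte aligned input, word loads for 4-byte alignment, and a byte loop otherwise. Only the source is probed; the destination is assumed compatible. The same dispatch in portable form (a sketch, with memcpy standing in for the wide transfers):

    #include <stdint.h>
    #include <string.h>

    void copy16x16_model(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_stride) {
      // Pick the chunk size once, outside the copy loop.
      const size_t chunk = ((uintptr_t)src % 16 == 0) ? 16
                         : ((uintptr_t)src % 8 == 0) ? 8
                         : ((uintptr_t)src % 4 == 0) ? 4 : 1;
      for (int r = 0; r < 16; ++r) {
        for (size_t c = 0; c < 16; c += chunk)
          memcpy(dst + c, src + c, chunk);  // LDM/STM, LDR/STR or LDRB/STRB
        src += src_stride;
        dst += dst_stride;
      }
    }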
+; + + + EXPORT |vp8_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0, #31] ; preload for next 16x16 block + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..d8362ef052fbe713d2ac2e5c432131fa95489438 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x4_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..c6a60c610bb8deed1b1e38428dc6fc945cf8049b --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x8_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_v6| + + END diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..9aa659fa70aa4feed90e4fabe2f1cc7679879040 --- /dev/null +++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm @@ -0,0 +1,70 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
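vp8_dc_only_idct_add_v6, added next, handles the case where only the DC coefficient is non-zero: the whole 4x4 inverse transform then collapses to adding one rounded constant to each predicted pixel and saturating to 8 bits. A sketch mirroring the vp8_dc_only_idct_add_c reference that the routine's header comment names (our transcription):

    // DC-only inverse transform: a1 is the constant every output pixel
    // gains; uxtab16/usat16 in the assembly do the add-and-clamp in pairs.
    void dc_only_idct_add_model(short input_dc, const unsigned char *pred,
                                int pred_stride, unsigned char *dst,
                                int dst_stride) {
      const int a1 = (input_dc + 4) >> 3;  // rounding matches the full IDCT
      for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
          const int v = pred[c] + a1;
          dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        pred += pred_stride;
        dst += dst_stride;
      }
    }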
+; + + EXPORT |vp8_dc_only_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, +; int pred_stride, unsigned char *dst_ptr, +; int dst_stride) +; r0 input_dc +; r1 pred_ptr +; r2 pred_stride +; r3 dst_ptr +; sp dst_stride + +|vp8_dc_only_idct_add_v6| PROC + stmdb sp!, {r4 - r7} + + add r0, r0, #4 ; input_dc += 4 + ldr r12, c0x0000FFFF + ldr r4, [r1], r2 + and r0, r12, r0, asr #3 ; input_dc >> 3 + mask + ldr r6, [r1], r2 + orr r0, r0, r0, lsl #16 ; a1 | a1 + + ldr r12, [sp, #16] ; dst stride + + uxtab16 r5, r0, r4 ; a1+2 | a1+0 + uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + ldr r4, [r1], r2 + str r5, [r3], r12 + ldr r6, [r1] + str r7, [r3], r12 + + uxtab16 r5, r0, r4 + uxtab16 r4, r0, r4, ror #8 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + str r5, [r3], r12 + str r7, [r3] + + ldmia sp!, {r4 - r7} + bx lr + + ENDP ; |vp8_dc_only_idct_add_v6| + +; Constant Pool +c0x0000FFFF DCD 0x0000FFFF + END diff --git a/vp8/common/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..2510ad838357c76e3f0e00a4855d13e0e1bd2095 --- /dev/null +++ b/vp8/common/arm/armv6/dequant_idct_v6.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
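The dequantizing IDCT that follows fuses three steps: scale each quantized coefficient by its per-position dequantization factor, run the 4x4 inverse DCT and add the clamped result into the destination, then zero the coefficient block (the run of str instructions at the end, standing in for vpx_memset) so the next block starts clean. In outline; idct4x4_add_model is a hypothetical helper for the inverse-transform-and-add step, not a real libvpx symbol:

    #include <string.h>

    void idct4x4_add_model(short *coeffs, unsigned char *dest, int stride);

    void dequant_idct_add_model(short *input, const short *dq,
                                unsigned char *dest, int stride) {
      // Element-wise dequantization (the smulbb/smultt pairs in the asm).
      for (int i = 0; i < 16; ++i)
        input[i] = (short)(input[i] * dq[i]);

      idct4x4_add_model(input, dest, stride);  // hypothetical helper

      memset(input, 0, 32);  // 16 coefficients x 2 bytes each
    }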
+; + + EXPORT |vp8_dequant_idct_add_v6| + + AREA |.text|, CODE, READONLY +;void vp8_dequant_idct_v6(short *input, short *dq, +; unsigned char *dest, int stride) +; r0 = q +; r1 = dq +; r2 = dst +; r3 = stride + +|vp8_dequant_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + mov r12, #4 + +vp8_dequant_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp] ; get stride from stack + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2] ; load input from dst + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2, r12] ; load input from dst + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [r2], r12 ; store output to dst + str r1, [r2], r12 ; store output to dst + bne vp8_dequant_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_idct_add_v6| + 
+; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/common/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..72f7e0ee57febf8af9c25d55f5d851c8f56dee6e --- /dev/null +++ b/vp8/common/arm/armv6/dequantize_v6.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_dequantize_b_loop_v6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------- +;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +|vp8_dequantize_b_loop_v6| PROC + stmdb sp!, {r4-r9, lr} + + ldr r3, [r0] ;load Q + ldr r4, [r1] ;load DQC + ldr r5, [r0, #4] + ldr r6, [r1, #4] + + mov r12, #2 ;loop counter + +dequant_loop + smulbb r7, r3, r4 ;multiply + smultt r8, r3, r4 + smulbb r9, r5, r6 + smultt lr, r5, r6 + + ldr r3, [r0, #8] + ldr r4, [r1, #8] + ldr r5, [r0, #12] + ldr r6, [r1, #12] + + strh r7, [r2], #2 ;store result + smulbb r7, r3, r4 ;multiply + strh r8, [r2], #2 + smultt r8, r3, r4 + strh r9, [r2], #2 + smulbb r9, r5, r6 + strh lr, [r2], #2 + smultt lr, r5, r6 + + subs r12, r12, #1 + + add r0, r0, #16 + add r1, r1, #16 + + ldrne r3, [r0] + strh r7, [r2], #2 ;store result + ldrne r4, [r1] + strh r8, [r2], #2 + ldrne r5, [r0, #4] + strh r9, [r2], #2 + ldrne r6, [r1, #4] + strh lr, [r2], #2 + + bne dequant_loop + + ldmia sp!, {r4-r9, pc} + ENDP ;|vp8_dequantize_b_loop_v6| + + END diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..1ba91ddd657a1da84f84426f82b3d9a37fcfb7cb --- /dev/null +++ b/vp8/common/arm/armv6/filter_v6.asm @@ -0,0 +1,624 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_first_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_16x16_armv6| + EXPORT |vp8_filter_block2d_first_pass_8x8_armv6| + EXPORT |vp8_filter_block2d_second_pass_armv6| + EXPORT |vp8_filter4_block2d_second_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_only_armv6| + EXPORT |vp8_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp8_filter +;------------------------------------- +; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the intput being a 1 byte value. 
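The six-tap first pass declared above uses the same transposed-intermediate trick as the bilinear code, with a wider kernel: the taps span src[-2..3], and the filtered sum is rounded, shifted by 7 and saturated to 8 bits (the usat #8 ... asr #7 sequence) before the 16-bit transposed store. A scalar model, leaving out the assembly's packed two-taps-per-register arithmetic and padded output stride:

    // Caller provides a valid 2-pixel left border, as the assembly assumes.
    void sixtap_first_pass_model(const unsigned char *src,
                                 unsigned short *dst,
                                 unsigned int src_pitch,
                                 unsigned int height, unsigned int width,
                                 const short *filter) {
      for (unsigned int r = 0; r < height; ++r) {
        for (unsigned int c = 0; c < width; ++c) {
          int sum = 0;
          for (int t = -2; t <= 3; ++t)  // 6 taps centered on the pixel
            sum += src[c + t] * filter[t + 2];
          sum = (sum + 64) >> 7;         // round_shift...
          if (sum < 0) sum = 0;          // ...and_clamp (usat #8)
          if (sum > 255) sum = 255;
          dst[c * height + r] = (unsigned short)sum;  // transposed store
        }
        src += src_pitch;
      }
    }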
+|vp8_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 16x16 version +; ----------------------------- +|vp8_filter_block2d_first_pass_16x16_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #18 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_16_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_16_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] 
; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_16_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #34 ; adding back block width(=16) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_16_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 8x8 version +; ----------------------------- +|vp8_filter_block2d_first_pass_8x8_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #10 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_8_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_8_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. 
+ ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_8_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #18 ; adding back block width(=8) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_8_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter4_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + mov r7, r3, lsl #16 ; height is top part of counter + + ldr r4, [r11] ; load up packed filter coefficients + add lr, r1, r3 ; save final destination pointer + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + mov r4, #0x40 ; rounding factor (for smlad{x}) + +|height_loop_2nd_4| + ldrd r8, [r0, #-4] ; load the data + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd_4| + ldr r10, [r0, #4]! 
+ smladx r6, r9, r12, r4 ; apply filter + pkhbt r8, r9, r8 + smlad r5, r8, r12, r4 + pkhbt r8, r10, r9 + smladx r6, r10, r11, r6 + sub r7, r7, #1 + smlad r5, r8, r11, r5 + + mov r8, r9 ; shift the data for the next loop + mov r9, r10 + + usat r6, #8, r6, asr #7 ; shift and clamp + usat r5, #8, r5, asr #7 + + strb r5, [r1], r2 ; the result is transposed back and stored + tst r7, #0xff + strb r6, [r1], r2 + + bne width_loop_2nd_4 + + subs r7, r7, #0x10000 + add r0, r0, #16 ; update src for next loop + sub r1, lr, r7, lsr #16 ; update dst for next loop + + bne height_loop_2nd_4 + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + add r7, r2, r3 ; preload next low + add r7, r7, #2 + pld [r0, r7] + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back output pitch + subs r7, r7, #1 + add r0, r0, r12 ; updata src for next loop + + add r11, r12, r3 ; preload next low + add r11, r11, #2 + pld [r0, r11] + + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter 
coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                    ; save r0 to stack
+    str     r1, [sp, #4]                ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                ; load data
+    orr     r7, r7, r3                  ; loop counter
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter first column in this inner loop, then move to the next column
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16         ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16        ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4
+    pkhbt   r10, r10, r11, lsl #16      ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16       ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2
+    sub     r0, r0, r2, lsl #2
+
+    pkhbt   r9, r9, r10, lsl #16        ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16      ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff
+
+    add     lr, lr, #0x40               ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12               ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000
+    add     r0, r0, #1                  ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
+
+    END
diff --git a/vp8/common/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c
new file mode 100644
index 0000000000000000000000000000000000000000..6002c0f12e176036b1731d20259c50d077ff76b1
--- /dev/null
+++ b/vp8/common/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vpx_config.h" +#include "vpx_rtcd.h" + + +void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..b4d44cbeba4197699c0c307cccd65c815b79643e --- /dev/null +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_short_idct4x4llm_v6_dual| + + AREA |.text|, CODE, READONLY + + +; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, +; unsigned char *dst, int stride) +; r0 short* input +; r1 unsigned char* pred +; r2 int pitch +; r3 unsigned char* dst +; sp int stride + +|vp8_short_idct4x4llm_v6_dual| PROC + stmdb sp!, {r4-r11, lr} + + sub sp, sp, #4 + + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + + mov r5, #0x00004E00 ; cos + orr r5, r5, #0x0000007B ; cospi8sqrt2minus1 + orr r5, r5, #1<<31 ; loop counter on top bit + +loop1_dual + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12, [r0, #(12*2)] ; i13|i12 + ldr r14, [r0, #(8*2)] ; i9 | i8 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 + + smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 + pkhtb r7, r9, r7, asr #16 ; 5c | 4c + pkhbt r8, r8, r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 + smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 + + subs r5, r5, #1<<31 ; i-- + + pkhtb r9, r11, r9, asr #16 ; 13c | 12c + ldr r11, [r0] ; i1 | i0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + + usub16 r7, r8, r7 ; c + uadd16 r6, r6, r10 ; d + uadd16 r10, r11, r14 ; a + usub16 r8, r11, r14 ; b + + uadd16 r9, r10, r6 ; a+d + usub16 r10, r10, r6 ; a-d + uadd16 r6, r8, r7 ; b+c + usub16 r7, r8, r7 ; b-c + + ; use input buffer to store intermediate results + str r6, [r0, #(4*2)] ; o5 | o4 + str r7, [r0, #(8*2)] ; o9 | o8 + str r10,[r0, #(12*2)] ; o13|o12 + str r9, [r0], #4 ; o1 | o0 + + bcs loop1_dual + + sub r0, r0, #8 ; reset input/output + str r0, [sp] + +loop2_dual + + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12,[r0, #(2*2)] ; i3 | i2 + ldr r14,[r0, #(6*2)] ; i7 | i6 + ldr r0, [r0, #(0*2)] ; i1 | i0 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16 + + pkhbt r11, r6, r0, lsl #16 ; i0 | i4 + pkhtb r7, r7, r9, asr #16 ; 1c | 5c + pkhtb r0, r0, r6, asr #16 ; i1 | i5 + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 + + uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 + uadd16 r10, r11, r9 ; a + usub16 r9, r11, r9 ; b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 + + subs r5, r5, #1<<31 ; i-- + + smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 + smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 + + pkhtb r7, r7, r12, asr #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 + + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 + usub16 r12, r8, r6 ; c (o1 | o5) + uadd16 r6, r11, r0 ; d (o3 | o7) + uadd16 r7, r10, r6 ; a+d + + mov r8, #4 ; set up 4's + orr r8, r8, #0x40000 ; 4|4 + + usub16 r6, r10, r6 ; a-d + uadd16 r6, r6, r8 ; a-d+4, 3|7 + uadd16 r7, r7, r8 ; a+d+4, 0|4 + uadd16 r10, r9, r12 ; b+c + usub16 r0, r9, r12 ; b-c + uadd16 r10, r10, r8 ; b+c+4, 1|5 + uadd16 r8, r0, r8 ; b-c+4, 2|6 + + ldr lr, [sp, #40] ; dst stride + + ldrb r0, [r1] ; pred p0 + ldrb r11, [r1, #1] ; pred p1 + ldrb r12, [r1, #2] ; pred p2 + + add r0, r0, r7, asr #19 ; p0 + o0 + add r11, r11, r10, asr #19 ; p1 + o1 + 
add     r12, r12, r8, asr #19           ; p2 + o2
+
+    usat    r0, #8, r0                  ; d0 = clip8(p0 + o0)
+    usat    r11, #8, r11                ; d1 = clip8(p1 + o1)
+    usat    r12, #8, r12                ; d2 = clip8(p2 + o2)
+
+    add     r0, r0, r11, lsl #8         ; |--|--|d1|d0|
+
+    ldrb    r11, [r1, #3]               ; pred p3
+
+    add     r0, r0, r12, lsl #16        ; |--|d2|d1|d0|
+
+    add     r11, r11, r6, asr #19       ; p3 + o3
+
+    sxth    r7, r7                      ;
+    sxth    r10, r10                    ;
+
+    usat    r11, #8, r11                ; d3 = clip8(p3 + o3)
+
+    sxth    r8, r8                      ;
+    sxth    r6, r6                      ;
+
+    add     r0, r0, r11, lsl #24        ; |d3|d2|d1|d0|
+
+    ldrb    r12, [r1, r2]!              ; pred p4
+    str     r0, [r3], lr
+    ldrb    r11, [r1, #1]               ; pred p5
+
+    add     r12, r12, r7, asr #3        ; p4 + o4
+    add     r11, r11, r10, asr #3       ; p5 + o5
+
+    usat    r12, #8, r12                ; d4 = clip8(p4 + o4)
+    usat    r11, #8, r11                ; d5 = clip8(p5 + o5)
+
+    ldrb    r7, [r1, #2]                ; pred p6
+    ldrb    r10, [r1, #3]               ; pred p7
+
+    add     r12, r12, r11, lsl #8       ; |--|--|d5|d4|
+
+    add     r7, r7, r8, asr #3          ; p6 + o6
+    add     r10, r10, r6, asr #3        ; p7 + o7
+
+    ldr     r0, [sp]                    ; load input pointer
+
+    usat    r7, #8, r7                  ; d6 = clip8(p6 + o6)
+    usat    r10, #8, r10                ; d7 = clip8(p7 + o7)
+
+    add     r12, r12, r7, lsl #16       ; |--|d6|d5|d4|
+    add     r12, r12, r10, lsl #24      ; |d7|d6|d5|d4|
+
+    str     r12, [r3], lr
+    add     r0, r0, #16
+    add     r1, r1, r2                  ; pred + pitch
+
+    bcs     loop2_dual
+
+    add     sp, sp, #4                  ; idct_output buffer
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+    END
diff --git a/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/vp8/common/arm/armv6/intra4x4_predict_v6.asm
new file mode 100644
index 0000000000000000000000000000000000000000..c5ec824b3409bbe5ccbc0400f78c5db7b39c9b36
--- /dev/null
+++ b/vp8/common/arm/armv6/intra4x4_predict_v6.asm
@@ -0,0 +1,611 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+; + + + EXPORT |vp8_intra4x4_predict_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + +;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, +; B_PREDICTION_MODE left_stride, int b_mode, +; unsigned char *dst, int dst_stride, +; unsigned char top_left) + +; r0: *Above +; r1: *yleft +; r2: left_stride +; r3: b_mode +; sp + #40: dst +; sp + #44: dst_stride +; sp + #48: top_left +|vp8_intra4x4_predict_armv6| PROC + push {r4-r12, lr} + + cmp r3, #10 + addlt pc, pc, r3, lsl #2 ; position independent switch + pop {r4-r12, pc} ; default + b b_dc_pred + b b_tm_pred + b b_ve_pred + b b_he_pred + b b_ld_pred + b b_rd_pred + b b_vr_pred + b b_vl_pred + b b_hd_pred + b b_hu_pred + +b_dc_pred + ; load values + ldr r8, [r0] ; Above + ldrb r4, [r1], r2 ; Left[0] + mov r9, #0 + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + usad8 r12, r8, r9 + ldrb r7, [r1] ; Left[3] + + ; calculate dc + add r4, r4, r5 + add r4, r4, r6 + add r4, r4, r7 + add r4, r4, r12 + add r4, r4, #4 + ldr r0, [sp, #44] ; dst_stride + mov r12, r4, asr #3 ; (expected_dc + 4) >> 3 + + add r12, r12, r12, lsl #8 + ldr r3, [sp, #40] ; dst + add r12, r12, r12, lsl #16 + + ; store values + str r12, [r3], r0 + str r12, [r3], r0 + str r12, [r3], r0 + str r12, [r3] + + pop {r4-r12, pc} + +b_tm_pred + ldr r8, [r0] ; Above + ldrb r9, [sp, #48] ; top_left + ldrb r4, [r1], r2 ; Left[0] + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + ldrb r7, [r1] ; Left[3] + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + add r9, r9, r9, lsl #16 ; [tl|tl] + uxtb16 r10, r8 ; a[2|0] + uxtb16 r11, r8, ror #8 ; a[3|1] + ssub16 r10, r10, r9 ; a[2|0] - [tl|tl] + ssub16 r11, r11, r9 ; a[3|1] - [tl|tl] + + add r4, r4, r4, lsl #16 ; l[0|0] + add r5, r5, r5, lsl #16 ; l[1|1] + add r6, r6, r6, lsl #16 ; l[2|2] + add r7, r7, r7, lsl #16 ; l[3|3] + + sadd16 r1, r4, r10 ; l[0|0] + a[2|0] - [tl|tl] + sadd16 r2, r4, r11 ; l[0|0] + a[3|1] - [tl|tl] + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r5, r10 ; l[1|1] + a[2|0] - [tl|tl] + sadd16 r5, r5, r11 ; l[1|1] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + sadd16 r1, r6, r10 ; l[2|2] + a[2|0] - [tl|tl] + sadd16 r2, r6, r11 ; l[2|2] + a[3|1] - [tl|tl] + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r7, r10 ; l[3|3] + a[2|0] - [tl|tl] + sadd16 r5, r7, r11 ; l[3|3] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + str r12, [r3], r0 + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3] + + pop {r4-r12, pc} + +b_ve_pred + ldr r8, [r0] ; a[3|2|1|0] + ldr r11, c00FF00FF + ldrb r9, [sp, #48] ; top_left + ldrb r10, [r0, #4] ; a[4] + + ldr r0, c00020002 + + uxtb16 r4, r8 ; a[2|0] + uxtb16 r5, r8, ror #8 ; a[3|1] + ldr r2, [sp, #44] ; dst_stride + pkhbt r9, r9, r5, lsl #16 ; a[1|-1] + + add r9, r9, r4, lsl #1 ;[a[1]+2*a[2] | tl+2*a[0] ] + uxtab16 r9, r9, r5 ;[a[1]+2*a[2]+a[3] | tl+2*a[0]+a[1] ] + ldr r3, [sp, #40] ; dst + uxtab16 r9, r9, r0 ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2] + + add r0, r0, r10, lsl #16 ;[a[4]+2 | 2] + add r0, r0, r4, asr #16 ;[a[4]+2 | a[2]+2] + add r0, r0, r5, lsl #1 ;[a[4]+2*a[3]+2 | a[2]+2*a[1]+2] + uadd16 r4, r4, r0 ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2] + + and r9, r11, r9, asr #2 + and r4, r11, r4, asr #2 + add r9, r9, r4, lsl #8 + + ; store values + str r9, [r3], r2 + str r9, [r3], r2 + str r9, [r3], r2 + 
str r9, [r3] + + pop {r4-r12, pc} + + +b_he_pred + ldrb r4, [r1], r2 ; Left[0] + ldrb r8, [sp, #48] ; top_left + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + ldrb r7, [r1] ; Left[3] + + add r8, r8, r4 ; tl + l[0] + add r9, r4, r5 ; l[0] + l[1] + add r10, r5, r6 ; l[1] + l[2] + add r11, r6, r7 ; l[2] + l[3] + + mov r0, #2<<14 + + add r8, r8, r9 ; tl + 2*l[0] + l[1] + add r4, r9, r10 ; l[0] + 2*l[1] + l[2] + add r5, r10, r11 ; l[1] + 2*l[2] + l[3] + add r6, r11, r7, lsl #1 ; l[2] + 2*l[3] + l[3] + + + add r8, r0, r8, lsl #14 ; (tl + 2*l[0] + l[1])>>2 in top half + add r9, r0, r4, lsl #14 ; (l[0] + 2*l[1] + l[2])>>2 in top half + add r10,r0, r5, lsl #14 ; (l[1] + 2*l[2] + l[3])>>2 in top half + add r11,r0, r6, lsl #14 ; (l[2] + 2*l[3] + l[3])>>2 in top half + + pkhtb r8, r8, r8, asr #16 ; l[-|0|-|0] + pkhtb r9, r9, r9, asr #16 ; l[-|1|-|1] + pkhtb r10, r10, r10, asr #16 ; l[-|2|-|2] + pkhtb r11, r11, r11, asr #16 ; l[-|3|-|3] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + add r8, r8, r8, lsl #8 ; l[0|0|0|0] + add r9, r9, r9, lsl #8 ; l[1|1|1|1] + add r10, r10, r10, lsl #8 ; l[2|2|2|2] + add r11, r11, r11, lsl #8 ; l[3|3|3|3] + + ; store values + str r8, [r3], r0 + str r9, [r3], r0 + str r10, [r3], r0 + str r11, [r3] + + pop {r4-r12, pc} + +b_ld_pred + ldr r4, [r0] ; Above[0-3] + ldr r12, c00020002 + ldr r5, [r0, #4] ; Above[4-7] + ldr lr, c00FF00FF + + uxtb16 r6, r4 ; a[2|0] + uxtb16 r7, r4, ror #8 ; a[3|1] + uxtb16 r8, r5 ; a[6|4] + uxtb16 r9, r5, ror #8 ; a[7|5] + pkhtb r10, r6, r8 ; a[2|4] + pkhtb r11, r7, r9 ; a[3|5] + + add r4, r6, r7, lsl #1 ; [a2+2*a3 | a0+2*a1] + add r4, r4, r10, ror #16 ; [a2+2*a3+a4 | a0+2*a1+a2] + uxtab16 r4, r4, r12 ; [a2+2*a3+a4+2 | a0+2*a1+a2+2] + + add r5, r7, r10, ror #15 ; [a3+2*a4 | a1+2*a2] + add r5, r5, r11, ror #16 ; [a3+2*a4+a5 | a1+2*a2+a3] + uxtab16 r5, r5, r12 ; [a3+2*a4+a5+2 | a1+2*a2+a3+2] + + pkhtb r7, r9, r8, asr #16 + add r6, r8, r9, lsl #1 ; [a6+2*a7 | a4+2*a5] + uadd16 r6, r6, r7 ; [a6+2*a7+a7 | a4+2*a5+a6] + uxtab16 r6, r6, r12 ; [a6+2*a7+a7+2 | a4+2*a5+a6+2] + + uxth r7, r9 ; [ a5] + add r7, r7, r8, asr #15 ; [ a5+2*a6] + add r7, r7, r9, asr #16 ; [ a5+2*a6+a7] + uxtah r7, r7, r12 ; [ a5+2*a6+a7+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r4, lr, r4, asr #2 + and r5, lr, r5, asr #2 + and r6, lr, r6, asr #2 + mov r7, r7, asr #2 + + add r8, r4, r5, lsl #8 ; [3|2|1|0] + str r8, [r3], r0 + + mov r9, r8, lsr #8 + add r9, r9, r6, lsl #24 ; [4|3|2|1] + str r9, [r3], r0 + + mov r10, r9, lsr #8 + add r10, r10, r7, lsl #24 ; [5|4|3|2] + str r10, [r3], r0 + + mov r6, r6, lsr #16 + mov r11, r10, lsr #8 + add r11, r11, r6, lsl #24 ; [6|5|4|3] + str r11, [r3] + + pop {r4-r12, pc} + +b_rd_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1], r2 ; l[3] = pp[0] + + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + add r4, r4, r6, lsl #16 ; p[2|0] + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r4, r4, r5, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r4, r4, r6 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, 
r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r6, r6, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r6, r6, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + add r7, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r7, r7, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r7, r7, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r7, lr, r7, asr #2 + and r6, lr, r6, asr #2 + and r5, lr, r5, asr #2 + and r4, lr, r4, asr #2 + + add r8, r6, r7, lsl #8 ; [6|5|4|3] + str r8, [r3], r0 + + mov r9, r8, lsl #8 ; [5|4|3|-] + uxtab r9, r9, r4, ror #16 ; [5|4|3|2] + str r9, [r3], r0 + + mov r10, r9, lsl #8 ; [4|3|2|-] + uxtab r10, r10, r5 ; [4|3|2|1] + str r10, [r3], r0 + + mov r11, r10, lsl #8 ; [3|2|1|-] + uxtab r11, r11, r4 ; [3|2|1|0] + str r11, [r3] + + pop {r4-r12, pc} + +b_vr_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1] ; l[3] = pp[0] + + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r4, c00010001 + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, r6, r7, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r6, r6, r8 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r6, r6, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + uadd16 r11, r8, r9 ; [p6+p7 | p4+p5] + uhadd16 r11, r11, r4 ; [(p6+p7+1)>>1 | (p4+p5+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r7, r7, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r7, r7, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + uadd16 r2, r9, r10 ; [p7+p8 | p5+p6] + uhadd16 r2, r2, r4 ; [(p7+p8+1)>>1 | (p5+p6+1)>>1] + ; [J|I] + + add r8, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r8, r8, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r8, r8, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r5, lr, r5, asr #2 ; [B|A] + and r6, lr, r6, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [L|K] + + add r12, r11, r2, lsl #8 ; [J|F|I|E] + str r12, [r3], r0 + + add r12, r7, r8, lsl #8 ; [L|H|K|G] + str r12, [r3], r0 + + pkhbt r2, r6, r2, lsl #16 ; [-|I|-|C] + add r2, r2, r11, lsl #8 ; [F|I|E|C] + + pkhtb r12, r6, r5 ; [-|D|-|A] + pkhtb r10, r7, r5, asr #16 ; [-|H|-|B] + str r2, [r3], r0 + add r12, r12, r10, lsl #8 ; [H|D|B|A] + str r12, [r3] + + pop {r4-r12, pc} + +b_vl_pred + ldr r4, [r0] ; [3|2|1|0] = Above[0-3] + ldr r12, c00020002 + ldr r5, [r0, #4] ; [7|6|5|4] = Above[4-7] + ldr lr, c00FF00FF + ldr r2, c00010001 + + mov r0, r4, lsr #16 ; [-|-|3|2] + add r0, r0, r5, lsl #16 ; [5|4|3|2] + uxtb16 r6, r4 ; [2|0] + uxtb16 r7, r4, ror #8 ; [3|1] + uxtb16 r8, r0 ; [4|2] + uxtb16 r9, r0, ror #8 ; [5|3] + uxtb16 r10, r5 ; [6|4] + uxtb16 r11, r5, ror #8 ; [7|5] + + uadd16 r4, r6, r7 ; [p2+p3 | p0+p1] + uhadd16 r4, r4, r2 ; [(p2+p3+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r5, r6, r7, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r5, r5, r8 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r5, r5, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + uadd16 r6, r7, r8 ; [p3+p4 | p1+p2] + uhadd16 r6, r6, r2 ; [(p3+p4+1)>>1 | (p1+p2+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r7, r7, r9 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r7, r7, 
r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r8, r8, r9, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r8, r8, r10 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r8, r8, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + add r9, r9, r10, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r9, r9, r11 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r9, r9, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r5, lr, r5, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [I|D] + and r9, lr, r9, asr #2 ; [J|H] + + add r10, r4, r6, lsl #8 ; [F|B|E|A] + str r10, [r3], r0 + + add r5, r5, r7, lsl #8 ; [H|C|G|D] + str r5, [r3], r0 + + pkhtb r12, r8, r4, asr #16 ; [-|I|-|B] + pkhtb r10, r9, r8 ; [-|J|-|D] + + add r12, r6, r12, lsl #8 ; [I|F|B|E] + str r12, [r3], r0 + + add r10, r7, r10, lsl #8 ; [J|H|D|G] + str r10, [r3] + + pop {r4-r12, pc} + +b_hd_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1] ; l[3] = pp[0] + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + + add r4, r4, r5, lsl #16 ; p[1|0] + add r5, r5, r6, lsl #16 ; p[2|1] + add r6, r6, r7, lsl #16 ; p[3|2] + add r7, r7, r8, lsl #16 ; p[4|3] + + ldr r12, c00020002 + ldr lr, c00FF00FF + ldr r2, c00010001 + + pkhtb r8, r7, r9 ; p[4|5] + pkhtb r1, r9, r10 ; p[7|6] + pkhbt r10, r8, r10, lsl #16 ; p[6|5] + + uadd16 r11, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r11, r11, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r6 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + + uadd16 r0, r6, r7 ; [p3+p4 | p2+p3] + uhadd16 r0, r0, r2 ; [(p3+p4+1)>>1 | (p2+p3+1)>>1] + ; [F|E] + + add r5, r6, r7, lsl #1 ; [p3+2*p4 | p2+2*p3] + add r5, r5, r8, ror #16 ; [p3+2*p4+p5 | p2+2*p3+p4] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p2+2*p3+p4+2] + + add r6, r12, r8, ror #16 ; [p5+2 | p4+2] + add r6, r6, r10, lsl #1 ; [p5+2+2*p6 | p4+2+2*p5] + uxtab16 r6, r6, r1 ; [p5+2+2*p6+p7 | p4+2+2*p5+p6] + + ; scale down + and r4, lr, r4, asr #2 ; [D|C] + and r5, lr, r5, asr #2 ; [H|G] + and r6, lr, r6, asr #2 ; [J|I] + + ldr lr, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + pkhtb r2, r0, r6 ; [-|F|-|I] + pkhtb r12, r6, r5, asr #16 ; [-|J|-|H] + add r12, r12, r2, lsl #8 ; [F|J|I|H] + add r2, r0, r5, lsl #8 ; [H|F|G|E] + mov r12, r12, ror #24 ; [J|I|H|F] + str r12, [r3], lr + + mov r7, r11, asr #16 ; [-|-|-|B] + str r2, [r3], lr + add r7, r7, r0, lsl #16 ; [-|E|-|B] + add r7, r7, r4, asr #8 ; [-|E|D|B] + add r7, r7, r5, lsl #24 ; [G|E|D|B] + str r7, [r3], lr + + add r5, r11, r4, lsl #8 ; [D|B|C|A] + str r5, [r3] + + pop {r4-r12, pc} + + + +b_hu_pred + ldrb r4, [r1], r2 ; Left[0] + ldr r12, c00020002 + ldrb r5, [r1], r2 ; Left[1] + ldr lr, c00FF00FF + ldrb r6, [r1], r2 ; Left[2] + ldr r2, c00010001 + ldrb r7, [r1] ; Left[3] + + add r4, r4, r5, lsl #16 ; [1|0] + add r5, r5, r6, lsl #16 ; [2|1] + add r9, r6, r7, lsl #16 ; [3|2] + + uadd16 r8, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r8, r8, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r9 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + ldr r2, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + and r4, lr, r4, asr #2 ; [D|C] + + add r10, r6, r7 ; [p2+p3] + add r11, r10, r7, lsl #1 ; [p2+3*p3] + add r10, r10, #1 + add r11, r11, #2 + mov r10, r10, asr #1 ; [E] + mov 
r11, r11, asr #2 ; [F] + + add r9, r7, r9, asr #8 ; [-|-|G|G] + add r0, r8, r4, lsl #8 ; [D|B|C|A] + add r7, r9, r9, lsl #16 ; [G|G|G|G] + + str r0, [r3], r2 + + mov r1, r8, asr #16 ; [-|-|-|B] + add r1, r1, r4, asr #8 ; [-|-|D|B] + add r1, r1, r10, lsl #16 ; [-|E|D|B] + add r1, r1, r11, lsl #24 ; [F|E|D|B] + str r1, [r3], r2 + + add r10, r11, lsl #8 ; [-|-|F|E] + add r10, r10, r9, lsl #16 ; [G|G|F|E] + str r10, [r3], r2 + + str r7, [r3] + + pop {r4-r12, pc} + + ENDP + +; constants +c00010001 + DCD 0x00010001 +c00020002 + DCD 0x00020002 +c00FF00FF + DCD 0x00FF00FF + + END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..31ef09cada3f1838d82d54de98244cd41d486b2b --- /dev/null +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -0,0 +1,136 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_short_inv_walsh4x4_v6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) +|vp8_short_inv_walsh4x4_v6| PROC + + stmdb sp!, {r4 - r12, lr} + + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] + + qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] + qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] + qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] + qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] + + qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] + qadd16 r4, r12, lr ; c1 + d1 [5 | 4] + qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] + qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] + + qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] + qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] + qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] + qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] + + qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] + qadd16 r5, r12, lr ; c1 + d1 [7 | 6] + qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] + qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] + + ; first transform complete + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] + ldr r10, c0x00030003 + qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r2, r2, r10 ; [b2+3|c2+3] + qadd16 r3, r3, r10 ; [a2+3|d2+3] + qadd16 r4, r4, r10 ; [b2+3|c2+3] + qadd16 r5, r5, r10 ; [a2+3|d2+3] + + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 + + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 + + qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx r5, 
r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] + qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r6, r6, r10 ; [b2+3|c2+3] + qadd16 r7, r7, r10 ; [a2+3|d2+3] + qadd16 r8, r8, r10 ; [b2+3|c2+3] + qadd16 r9, r9, r10 ; [a2+3|d2+3] + + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 + + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 + + ldmia sp!, {r4 - r12, pc} + ENDP ; |vp8_short_inv_walsh4x4_v6| + + +; Constant Pool +c0x00030003 DCD 0x00030003 + END diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..1cbbbcdef5e2533b43a7095b9588d07b474d939b --- /dev/null +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -0,0 +1,1282 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_armv6| + EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_vertical_edge_armv6| + EXPORT |vp8_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *blimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, 
[r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Hnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp8_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp8_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 
; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp8_filter += 1 + shadd8 r7, r7, lr ; vp8_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp8_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBHnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. 
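+    ; The code below implements the vp8_mbfilter() arithmetic. In scalar
+    ; form (a sketch -- 'sclamp' is invented shorthand for signed-char
+    ; saturation, 'mask' and 'hev' are the masks computed above):
+    ;
+    ;   w   = sclamp(sclamp(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+    ;   wh  = w & hev;                  /* high-variance pixels...     */
+    ;   qs0 = sclamp(qs0 - (sclamp(wh + 4) >> 3));
+    ;   ps0 = sclamp(ps0 + (sclamp(wh + 3) >> 3));
+    ;   w  &= ~hev;                     /* ...then taper the remainder */
+    ;   u = sclamp((27 * w + 63) >> 7); qs0 = sclamp(qs0 - u);
+    ;                                   ps0 = sclamp(ps0 + u);
+    ;   u = sclamp((18 * w + 63) >> 7); qs1 = sclamp(qs1 - u);
+    ;                                   ps1 = sclamp(ps1 + u);
+    ;   u = sclamp(( 9 * w + 63) >> 7); qs2 = sclamp(qs2 - u);
+    ;                                   ps2 = sclamp(ps2 + u);
+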
+ ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load 
ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Vnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). 
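
After the transpose that follows, the eor/qsub8/qadd8/shadd8 run implements VP8's inner-edge filter. Per pixel, in the signed domain (bytes already XORed with 0x80), it amounts to the sketch below; clamp8 stands in for vp8_signed_char_clamp, and mask/hev are the 0x00-or-0xFF byte masks computed above. Illustrative only:

static signed char clamp8(int v)
{
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void inner_edge_filter(signed char mask, signed char hev,
                              signed char *op1, signed char *op0,
                              signed char *oq0, signed char *oq1)
{
    signed char ps1 = *op1, ps0 = *op0, qs0 = *oq0, qs1 = *oq1;
    signed char f, f1, f2, u;

    f  = (signed char)(clamp8(ps1 - qs1) & hev); /* outer taps only if hev */
    f  = clamp8(f + 3 * (qs0 - ps0));
    f  = (signed char)(f & mask);

    f1 = (signed char)(clamp8(f + 4) >> 3);      /* round one side up...  */
    f2 = (signed char)(clamp8(f + 3) >> 3);      /* ...and the other down */
    *oq0 = clamp8(qs0 - f1);
    *op0 = clamp8(ps0 + f2);

    u = (signed char)(((f1 + 1) >> 1) & ~hev);   /* half of Filter1       */
    *oq1 = clamp8(qs1 - u);
    *op1 = clamp8(ps1 + u);
}

The three shadd8 instructions with a zero operand are how the arithmetic >>3 is done per byte lane.
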
Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + ;end of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp8_filter + + qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, 
src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + pld [src, #23] ; preload for next block + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + pld [src, #23] + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + pld [src, #23] + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + pld [src, #23] + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBVnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). 
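
For reference before the transposed load/filter sequence that follows: after the hev-gated Filter1/Filter2 adjustment of p0/q0, the residual filter value w = vp8_filter & ~hev is spread across three pixel pairs with weights 27, 18 and 9 (ratio 3:2:1, hence the "3/7th", "2/7th", "1/7th" comments), each rounded by 63 and shifted down by 7. A scalar sketch of those tap updates (clamp8 models vp8_signed_char_clamp; illustrative, not the reference code):

static signed char clamp8(int v)
{
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void mb_filter_taps(signed char w,
                           signed char *op2, signed char *op1,
                           signed char *op0, signed char *oq0,
                           signed char *oq1, signed char *oq2)
{
    signed char u;

    u = clamp8((63 + w * 27) >> 7);
    *oq0 = clamp8(*oq0 - u);
    *op0 = clamp8(*op0 + u);

    u = clamp8((63 + w * 18) >> 7);
    *oq1 = clamp8(*oq1 - u);
    *op1 = clamp8(*op1 + u);

    u = clamp8((63 + w * 9) >> 7);
    *oq2 = clamp8(*oq2 - u);
    *op2 = clamp8(*op2 + u);
}
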
Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb 
r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + pld [src, #23] ; preload for next block + ldrne r6, [src], pstep ; load source data + pld [src, #23] + ldrne r7, [src], pstep + pld [src, #23] + ldrne r8, [src], pstep + pld [src, #23] + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - 
r11, pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..5e00cf01bbd7296ea6bd9455470c1f2d2baa617b --- /dev/null +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + + +src RN r0 +pstep RN r1 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *blimit + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_simple_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + ldrb r12, [r2] ; blimit + ldr r3, [src, -pstep, lsl #1] ; p1 + ldr r4, [src, -pstep] ; p0 + ldr r5, [src] ; q0 + ldr r6, [src, pstep] ; q1 + orr r12, r12, r12, lsl #8 ; blimit + ldr r2, c0x80808080 + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; double the count. we're doing 4 at a time + mov lr, #0 ; need 0 in a couple places + +|simple_hnext8| + ; vp8_simple_filter_mask() + + uqsub8 r7, r3, r6 ; p1 - q1 + uqsub8 r8, r6, r3 ; q1 - p1 + uqsub8 r10, r4, r5 ; p0 - q0 + uqsub8 r11, r5, r4 ; q0 - p0 + orr r8, r8, r7 ; abs(p1 - q1) + orr r10, r10, r11 ; abs(p0 - q0) + uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 + uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1 + uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 + mvn r8, #0 + usub8 r10, r12, r10 ; compare to flimit. 
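
The mvn/usub8/sel triple that follows turns the saturating comparison into a per-byte all-ones-or-zero mask: usub8 sets one GE flag per byte lane, and sel then picks bytes from its two operands on those flags. A scalar model of the per-lane effect (a sketch, not the Arm pseudocode):

#include <stdint.h>

static uint32_t bytes_where_ge(uint32_t a, uint32_t b)
{
    uint32_t mask = 0;
    int i;

    for (i = 0; i < 4; i++) {
        uint32_t ab = (a >> (8 * i)) & 0xFF;
        uint32_t bb = (b >> (8 * i)) & 0xFF;
        if (ab >= bb)                   /* usub8's GE flag for this lane */
            mask |= 0xFFu << (8 * i);   /* sel takes the all-ones byte   */
    }
    return mask;   /* 0xFF lanes get filtered; 0x00 lanes are untouched */
}

Here a is the broadcast blimit and b the |p0-q0|*2 + |p1-q1|/2 measure, so a lane is filtered exactly when it is within blimit.
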
+                                    ; usub8 sets GE flags
+    sel     r10, r8, lr             ; filter mask: F or 0
+    cmp     r10, #0
+    beq     simple_hskip_filter     ; skip filtering if all masks are 0x00
+
+    ;vp8_simple_filter()
+
+    eor     r3, r3, r2              ; p1 offset to convert to a signed value
+    eor     r6, r6, r2              ; q1 offset to convert to a signed value
+    eor     r4, r4, r2              ; p0 offset to convert to a signed value
+    eor     r5, r5, r2              ; q0 offset to convert to a signed value
+
+    qsub8   r3, r3, r6              ; vp8_filter = p1 - q1
+    qsub8   r6, r5, r4              ; q0 - p0
+    qadd8   r3, r3, r6              ; += q0 - p0
+    ldr     r7, c0x04040404
+    qadd8   r3, r3, r6              ; += q0 - p0
+    ldr     r8, c0x03030303
+    qadd8   r3, r3, r6              ; vp8_filter = p1-q1 + 3*(q0-p0)
+    ;STALL
+    and     r3, r3, r10             ; vp8_filter &= mask
+
+    qadd8   r7, r3, r7              ; Filter1 = vp8_filter + 4
+    qadd8   r8, r3, r8              ; Filter2 = vp8_filter + 3
+
+    shadd8  r7, r7, lr
+    shadd8  r8, r8, lr
+    shadd8  r7, r7, lr
+    shadd8  r8, r8, lr
+    shadd8  r7, r7, lr              ; Filter1 >>= 3
+    shadd8  r8, r8, lr              ; Filter2 >>= 3
+
+    qsub8   r5, r5, r7              ; u = q0 - Filter1
+    qadd8   r4, r4, r8              ; u = p0 + Filter2
+    eor     r5, r5, r2              ; *oq0 = u^0x80
+    str     r5, [src]               ; store oq0 result
+    eor     r4, r4, r2              ; *op0 = u^0x80
+    str     r4, [src, -pstep]       ; store op0 result
+
+|simple_hskip_filter|
+    subs    r9, r9, #1
+    addne   src, src, #4            ; next row
+
+    ldrne   r3, [src, -pstep, lsl #1] ; p1
+    ldrne   r4, [src, -pstep]       ; p0
+    ldrne   r5, [src]               ; q0
+    ldrne   r6, [src, pstep]        ; q1
+
+    bne     simple_hnext8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldrb    r12, [r2]               ; r12: blimit
+    ldr     r2, c0x80808080
+    orr     r12, r12, r12, lsl #8
+
+    ; load source data to r7, r8, r9, r10
+    ldrh    r3, [src, #-2]
+    pld     [src, #23]              ; preload for next block
+    ldrh    r4, [src], pstep
+    orr     r12, r12, r12, lsl #16
+
+    ldrh    r5, [src, #-2]
+    pld     [src, #23]
+    ldrh    r6, [src], pstep
+
+    pkhbt   r7, r3, r4, lsl #16
+
+    ldrh    r3, [src, #-2]
+    pld     [src, #23]
+    ldrh    r4, [src], pstep
+
+    pkhbt   r8, r5, r6, lsl #16
+
+    ldrh    r5, [src, #-2]
+    pld     [src, #23]
+    ldrh    r6, [src], pstep
+    mov     r11, #4                 ; double the count.
we're doing 4 at a time + +|simple_vnext8| + ; vp8_simple_filter_mask() function + pkhbt r9, r3, r4, lsl #16 + pkhbt r10, r5, r6, lsl #16 + + ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 + TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 + + uqsub8 r7, r3, r6 ; p1 - q1 + uqsub8 r8, r6, r3 ; q1 - p1 + uqsub8 r9, r4, r5 ; p0 - q0 + uqsub8 r10, r5, r4 ; q0 - p0 + orr r7, r7, r8 ; abs(p1 - q1) + orr r9, r9, r10 ; abs(p0 - q0) + mov r8, #0 + uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 + uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 + uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 + mvn r10, #0 ; r10 == -1 + + usub8 r7, r12, r7 ; compare to flimit + sel lr, r10, r8 ; filter mask + + cmp lr, #0 + beq simple_vskip_filter ; skip filtering + + ;vp8_simple_filter() function + eor r3, r3, r2 ; p1 offset to convert to a signed value + eor r6, r6, r2 ; q1 offset to convert to a signed value + eor r4, r4, r2 ; p0 offset to convert to a signed value + eor r5, r5, r2 ; q0 offset to convert to a signed value + + qsub8 r3, r3, r6 ; vp8_filter = p1 - q1 + qsub8 r6, r5, r4 ; q0 - p0 + + qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 + ldr r9, c0x03030303 ; r9 = 3 + + qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 + ldr r7, c0x04040404 + + qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0)) + ;STALL + and r3, r3, lr ; vp8_filter &= mask + + qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3 + qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4 + + shadd8 r9 , r9 , r8 + shadd8 r3 , r3 , r8 + shadd8 r9 , r9 , r8 + shadd8 r3 , r3 , r8 + shadd8 r9 , r9 , r8 ; Filter2 >>= 3 + shadd8 r3 , r3 , r8 ; Filter1 >>= 3 + + ;calculate output + sub src, src, pstep, lsl #2 + + qadd8 r4, r4, r9 ; u = p0 + Filter2 + qsub8 r5, r5, r3 ; u = q0 - Filter1 + eor r4, r4, r2 ; *op0 = u^0x80 + eor r5, r5, r2 ; *oq0 = u^0x80 + + strb r4, [src, #-1] ; store the result + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + strb r5, [src], pstep + +|simple_vskip_filter| + subs r11, r11, #1 + + ; load soure data to r7, r8, r9, r10 + ldrneh r3, [src, #-2] + pld [src, #23] ; preload for next block + ldrneh r4, [src], pstep + + ldrneh r5, [src, #-2] + pld [src, #23] + ldrneh r6, [src], pstep + + pkhbt r7, r3, r4, lsl #16 + + ldrneh r3, [src, #-2] + pld [src, #23] + ldrneh r4, [src], pstep + + pkhbt r8, r5, r6, lsl #16 + + ldrneh r5, [src, #-2] + pld [src, #23] + ldrneh r6, [src], pstep + + bne simple_vnext8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 + + END diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm new file mode 100644 index 0000000000000000000000000000000000000000..e81aef53d5146a285582550b2d27538ed8d4ffea --- /dev/null +++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm @@ -0,0 +1,273 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sixtap_predict8x4_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pitch +;------------------------------------- +;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. +;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, +;and the result is stored in transpose. +|vp8_sixtap_predict8x4_armv6| PROC + stmdb sp!, {r4 - r11, lr} + str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + add lr, sp, #4 ;point to temporary buffer + beq skip_firstpass_filter + +;first-pass filter + adr r12, filter8_coeff + sub r0, r0, r1, lsl #1 + + add r3, r1, #10 ; preload next low + pld [r0, r3] + + add r2, r12, r2, lsl #4 ;calculate filter location + add r0, r0, #3 ;adjust src only for loading convinience + + ldr r3, [r2] ; load up packed filter coefficients + ldr r4, [r2, #4] + ldr r5, [r2, #8] + + mov r2, #0x90000 ; height=9 is top part of counter + + sub r1, r1, #8 + +|first_pass_hloop_v6| + ldrb r6, [r0, #-5] ; load source data + ldrb r7, [r0, #-4] + ldrb r8, [r0, #-3] + ldrb r9, [r0, #-2] + ldrb r10, [r0, #-1] + + orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 + + pkhbt r6, r6, r7, lsl #16 ; r7 | r6 + pkhbt r7, r7, r8, lsl #16 ; r8 | r7 + + pkhbt r8, r8, r9, lsl #16 ; r9 | r8 + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + +|first_pass_wloop_v6| + smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1] + smuad r12, r7, r3 + + ldrb r6, [r0], #1 + + smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3] + ldrb r7, [r0], #1 + smlad r12, r9, r4, r12 + + pkhbt r10, r10, r6, lsl #16 ; r10 | r9 + pkhbt r6, r6, r7, lsl #16 ; r11 | r10 + smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5] + smlad r12, r6, r5, r12 + + sub r2, r2, #1 + + add r11, r11, #0x40 ; round_shift_and_clamp + tst r2, #0xff ; test loop counter + usat r11, #8, r11, asr #7 + add r12, r12, #0x40 + strh r11, [lr], #20 ; result is transposed and stored, which + usat r12, #8, r12, asr #7 + + strh r12, [lr], #20 + + movne r11, r6 + movne r12, r7 + + movne r6, r8 + movne r7, r9 + movne r8, r10 + movne r9, r11 + movne r10, r12 + + bne first_pass_wloop_v6 + + ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [src, ppl] + ;;pld [src, r9] + ;;ENDIF + + subs r2, r2, #0x10000 + + sub lr, lr, #158 + + add r0, r0, r1 ; move to next input line + + add r11, r1, #18 ; preload next low. 
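
For orientation, the note at the top of this file describes the scheme this loop is part of: nine rows are filtered horizontally into a scratch buffer, then the second pass filters that buffer down the columns. A scalar model of the two passes (the transposed 20-byte-pitch scratch layout and the packed smuad/smlad arithmetic are armv6 details this sketch leaves out, as are the xoffset==0 / yoffset==0 shortcuts; names are illustrative):

/* Filter taps sum to 128, hence the +64 rounding and >>7. */
static unsigned char clamp_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void sixtap_8x4_sketch(const unsigned char *src, int stride,
                              const short xf[6], const short yf[6],
                              unsigned char *dst, int pitch)
{
    unsigned char tmp[9][8];    /* 9 filtered rows feed 4 output rows */
    int r, c, k, s;

    src -= 2 * stride;          /* first pass starts two rows up      */
    for (r = 0; r < 9; r++)
        for (c = 0; c < 8; c++) {
            for (s = 64, k = 0; k < 6; k++)
                s += src[r * stride + c + k - 2] * xf[k];
            tmp[r][c] = clamp_u8(s >> 7);
        }

    for (r = 0; r < 4; r++)     /* second pass runs down the columns  */
        for (c = 0; c < 8; c++) {
            for (s = 64, k = 0; k < 6; k++)
                s += tmp[r + k][c] * yf[k];
            dst[r * pitch + c] = clamp_u8(s >> 7);
        }
}
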
adding back block width(=8), which is subtracted earlier + pld [r0, r11] + + bne first_pass_hloop_v6 + +;second pass filter +secondpass_filter + ldr r3, [sp], #4 ; load back yoffset + ldr r0, [sp, #216] ; load dst address from stack 180+36 + ldr r1, [sp, #220] ; load dst stride from stack 180+40 + + cmp r3, #0 + beq skip_secondpass_filter + + adr r12, filter8_coeff + add lr, r12, r3, lsl #4 ;calculate filter location + + mov r2, #0x00080000 + + ldr r3, [lr] ; load up packed filter coefficients + ldr r4, [lr, #4] + ldr r5, [lr, #8] + + pkhbt r12, r4, r3 ; pack the filter differently + pkhbt r11, r5, r4 + +second_pass_hloop_v6 + ldr r6, [sp] ; load the data + ldr r7, [sp, #4] + + orr r2, r2, #2 ; loop counter + +second_pass_wloop_v6 + smuad lr, r3, r6 ; apply filter + smulbt r10, r3, r6 + + ldr r8, [sp, #8] + + smlad lr, r4, r7, lr + smladx r10, r12, r7, r10 + + ldrh r9, [sp, #12] + + smlad lr, r5, r8, lr + smladx r10, r11, r8, r10 + + add sp, sp, #4 + smlatb r10, r5, r9, r10 + + sub r2, r2, #1 + + add lr, lr, #0x40 ; round_shift_and_clamp + tst r2, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r0], r1 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + strb r10, [r0],r1 + + movne r6, r7 + movne r7, r8 + + bne second_pass_wloop_v6 + + subs r2, r2, #0x10000 + add sp, sp, #12 ; updata src for next loop (20-8) + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne second_pass_hloop_v6 + + add sp, sp, #20 + ldmia sp!, {r4 - r11, pc} + +;-------------------- +skip_firstpass_filter + sub r0, r0, r1, lsl #1 + sub r1, r1, #8 + mov r2, #9 + +skip_firstpass_hloop + ldrb r4, [r0], #1 ; load data + subs r2, r2, #1 + ldrb r5, [r0], #1 + strh r4, [lr], #20 ; store it to immediate buffer + ldrb r6, [r0], #1 ; load data + strh r5, [lr], #20 + ldrb r7, [r0], #1 + strh r6, [lr], #20 + ldrb r8, [r0], #1 + strh r7, [lr], #20 + ldrb r9, [r0], #1 + strh r8, [lr], #20 + ldrb r10, [r0], #1 + strh r9, [lr], #20 + ldrb r11, [r0], #1 + strh r10, [lr], #20 + add r0, r0, r1 ; move to next input line + strh r11, [lr], #20 + + sub lr, lr, #158 ; move over to next column + bne skip_firstpass_hloop + + b secondpass_filter + +;-------------------- +skip_secondpass_filter + mov r2, #8 + add sp, sp, #4 ;start from src[0] instead of src[-2] + +skip_secondpass_hloop + ldr r6, [sp], #4 + subs r2, r2, #1 + ldr r8, [sp], #4 + + mov r7, r6, lsr #16 ; unpack + strb r6, [r0], r1 + mov r9, r8, lsr #16 + strb r7, [r0], r1 + add sp, sp, #12 ; 20-8 + strb r8, [r0], r1 + strb r9, [r0], r1 + + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne skip_secondpass_hloop + + add sp, sp, #16 ; 180 - (160 +4) + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;----------------- +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... 
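
How one row of the packed table below maps back to the six reference taps in the commented-out DCD rows: each word holds two 16-bit coefficients, low halfword first, so smuad/smlad can consume two taps per multiply-accumulate. A small illustrative helper (unpack_taps is not part of the source):

#include <stdint.h>

static void unpack_taps(const uint32_t row[3], int16_t taps[6])
{
    int i;

    for (i = 0; i < 3; i++) {
        taps[2 * i]     = (int16_t)(row[i] & 0xFFFF);  /* low halfword  */
        taps[2 * i + 1] = (int16_t)(row[i] >> 16);     /* high halfword */
    }
    /* e.g. {0xfff50002, 0x0024006c, 0x0001fff8} unpacks to
     * {2, -11, 108, 36, -8, 1}. */
}
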
+filter8_coeff + DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 + DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 + DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 + DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 + DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 + DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 + DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 + DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 + + ;DCD 0, 0, 128, 0, 0, 0 + ;DCD 0, -6, 123, 12, -1, 0 + ;DCD 2, -11, 108, 36, -8, 1 + ;DCD 0, -9, 93, 50, -6, 0 + ;DCD 3, -16, 77, 77, -16, 3 + ;DCD 0, -6, 50, 93, -9, 0 + ;DCD 1, -8, 36, 108, -11, 2 + ;DCD 0, -1, 12, 123, -6, 0 + + END diff --git a/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..1b4f5cf3b0f51a26e8ef5dbb02e809c8764b766b --- /dev/null +++ b/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm @@ -0,0 +1,96 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sad16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +; stack max_sad (not used) +|vp8_sad16x16_armv6| PROC + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git 
a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..dc84c30daf5357d853cbea53e9abd77c9e26bae7 --- /dev/null +++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm @@ -0,0 +1,154 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance16x16_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate 
sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..adc353d2006e21d854529ff1e7874ae13ba2408c --- /dev/null +++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm @@ -0,0 +1,101 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
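
The 16x16 kernel above and the 8x8 kernel below differ only in block size and the final shift. In scalar form, both return sse minus the squared-mean term, as in this sketch (shift is 8 for 256 pixels, 6 for 64; a reference sketch, not the project's C code):

#include <stdint.h>

static unsigned int variance_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, int shift,
                                    unsigned int *sse)
{
    int sum = 0;
    unsigned int sq = 0;
    int r, c;

    for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;
            sq  += (unsigned int)(d * d);
        }

    *sse = sq;
    return sq - (unsigned int)(((int64_t)sum * sum) >> shift);
}
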
+; + + + EXPORT |vp8_variance8x8_armv6| + + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance8x8_armv6| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; substract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; substract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + + END diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..dd2ce685c8bc7e71198a1ade8fcf06c15d4ab749 --- /dev/null +++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -0,0 +1,182 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_variance_halfpixvar16x16_h_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_h_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, 
r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..f972d9b5bac4b789e0160e15350539421a0fe7e0 --- /dev/null +++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
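
The horizontal variant above, and the _hv and _v variants that follow, all build rounded byte averages with the same mvn/uhsub8/eor trick: uhsub8 gives ((a - (255 - b)) >> 1) per lane, and XORing with 0x80 recenters that to (a + b + 1) >> 1. A self-contained, one-lane check of the identity (illustrative):

#include <assert.h>
#include <stdint.h>

static uint8_t halfpel_avg(uint8_t a, uint8_t b)
{
    int8_t h = (int8_t)(((int)a - (255 - b)) >> 1);  /* uhsub8 lane    */
    return (uint8_t)(h ^ 0x80);                      /* eor c80808080  */
}

int main(void)
{
    int a, b;

    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            assert(halfpel_avg((uint8_t)a, (uint8_t)b) == ((a + b + 1) >> 1));
    return 0;
}
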
+; + + + EXPORT |vp8_variance_halfpixvar16x16_hv_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_hv_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; pointer to pixels on the next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load source pixels a, row N + ldr r6, [r0, #1] ; load source pixels b, row N + ldr r5, [r9, #0] ; load source pixels c, row N+1 + ldr r7, [r9, #1] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #0] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load source pixels a, row N + ldr r6, [r0, #5] ; load source pixels b, row N + ldr r5, [r9, #4] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #5] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #4] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load source pixels a, row N + ldr 
r6, [r0, #9] ; load source pixels b, row N + ldr r5, [r9, #8] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #9] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #8] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load source pixels a, row N + ldr r6, [r0, #13] ; load source pixels b, row N + ldr r5, [r9, #12] ; load source pixels c, row N+1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + ldr r7, [r9, #13] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #12] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm new file mode 100644 index 0000000000000000000000000000000000000000..f5da9c09eedd4cdbd516782a5c25eb919409c50e --- /dev/null +++ 
b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -0,0 +1,184 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance_halfpixvar16x16_v_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_v_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; set src pointer to next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r9, #0] ; load 4 src pixels from next row + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r9, #4] ; load 4 src pixels from next row + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r9, #8] ; load 4 src pixels from next row + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select 
+    usub8   r6, r5, r4              ; calculate difference with reversed operands
+    sel     r6, r6, lr              ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr              ; calculate sum of positive differences
+    usad8   r5, r6, lr              ; calculate sum of negative differences
+    orr     r6, r6, r7              ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4              ; add positive differences to sum
+    sub     r8, r8, r5              ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6                  ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8          ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11        ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]           ; load 4 src pixels
+    ldr     r6, [r9, #12]           ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]           ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11        ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5              ; calculate difference
+    add     r0, r0, r1              ; set src_ptr to next row
+    sel     r7, r6, lr              ; select bytes with positive difference
+    usub8   r6, r5, r4              ; calculate difference with reversed operands
+    add     r2, r2, r3              ; set ref_ptr to next row
+    sel     r6, r6, lr              ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr              ; calculate sum of positive differences
+    usad8   r5, r6, lr              ; calculate sum of negative differences
+    orr     r6, r6, r7              ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4              ; add positive differences to sum
+    sub     r8, r8, r5              ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6                  ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8          ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11        ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11        ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return variance
+    ldr     r6, [sp, #40]           ; get address of sse
+    mul     r0, r8, r8              ; sum * sum
+    str     r11, [r6]               ; store sse
+    sub     r0, r11, r0, lsr #8     ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
new file mode 100644
index 0000000000000000000000000000000000000000..c63073c779a6ef483bdd1a8f653d9e60ef54eb20
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "bilinearfilter_arm.h"
+
+void vp8_filter_block2d_bil_armv6
+(
+    unsigned char  *src_ptr,
+    unsigned char  *dst_ptr,
+    unsigned int    src_pitch,
+    unsigned int    dst_pitch,
+    const short    *HFilter,
+    const short    *VFilter,
+    int             Width,
+    int             Height
+)
+{
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically...
*/ + vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp8_bilinear_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); +} + +void vp8_bilinear_predict8x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); +} + +void vp8_bilinear_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h new file mode 100644 index 0000000000000000000000000000000000000000..b7155d3f0a5c6413b89786055a9669e08442293f --- /dev/null +++ b/vp8/common/arm/bilinearfilter_arm.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef BILINEARFILTER_ARM_H +#define BILINEARFILTER_ARM_H + +extern void vp8_filter_block2d_bil_first_pass_armv6 +( + const unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +extern void vp8_filter_block2d_bil_second_pass_armv6 +( + const unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +#endif /* BILINEARFILTER_ARM_H */ diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c new file mode 100644 index 0000000000000000000000000000000000000000..70e72aa4774535e87b7fc0a516910d9b6cf9e318 --- /dev/null +++ b/vp8/common/arm/dequantize_arm.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/blockd.h"
+
+#if HAVE_NEON
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_MEDIA
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_NEON
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
+{
+    short *DQ = d->dqcoeff;
+    short *Q  = d->qcoeff;
+
+    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_MEDIA
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
+{
+    short *DQ = d->dqcoeff;
+    short *Q  = d->qcoeff;
+
+    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
new file mode 100644
index 0000000000000000000000000000000000000000..148951a92ffb3951d95a5df5042c04ba76ffcb5d
--- /dev/null
+++ b/vp8/common/arm/filter_arm.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   output_width,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   output_width,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   output_width,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   cnt,
+    const short   *vp8_filter
+);
+
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   cnt,
+    const short   *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   cnt,
+    unsigned int   output_pitch,
+    const short   *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   cnt,
+    unsigned int   output_pitch,
+    const short   *vp8_filter
+);
+
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
+(
+    unsigned char *src_ptr,
+    int            src_pixels_per_line,
+    int            xoffset,
+    int            yoffset,
+    unsigned char *dst_ptr,
+    int            dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
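+    /* Editor's sketch, not part of the original source: in scalar C the
+     * dispatch below amounts to
+     *
+     *     if (xoffset && !yoffset)       single horizontal pass;
+     *     else if (!xoffset && yoffset)  single vertical pass;
+     *     else                           both passes through FData[];
+     *
+     * vp8_sub_pel_filters[0] is the identity filter (all weight on the
+     * center tap), so a zero offset means that dimension needs no
+     * filtering at all. The 8x8 and 16x16 predictors below follow the
+     * same pattern. */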
/* Vfilter is null. First pass only */ + if (xoffset && !yoffset) + { + /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); + vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/ + + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); + } + else + { + /* Vfilter is a 4 tap filter */ + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + /* Vfilter is 6 tap filter */ + else + { + vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + } +} + +void vp8_sixtap_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } + else + { + vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } + } +} + + +void vp8_sixtap_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); + } + /* Hfilter is null. 
Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + else + { + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + } + +} +#endif diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 0000000000000000000000000000000000000000..b8f9bd90efa726ec0a85d670b2d460cc1dbe85ff --- /dev/null +++ b/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vpx_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#if HAVE_MEDIA +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); +#endif + +#if HAVE_NEON +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; + +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif + +#if HAVE_MEDIA +/* ARMV6/MEDIA loopfilter functions*/ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + 
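+    /* note (editor's addition): the final argument counts 8-pixel edge
+       segments to filter - 2 spans the 16-pixel-wide luma edge, while the
+       half-width chroma planes below use 1 */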
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); +} +#endif + +#if HAVE_NEON +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* 
Horizontal B Filtering */ +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); +} +#endif diff --git a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e392786d43d8097aadb3155d8a404b7a920aedf6 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm @@ -0,0 +1,357 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_bilinear_predict16x16_neon| PROC + push {r4-r5, lr} + + adr r12, bifilter16_coeff + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! 
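+;(editor note: the source loads interleaved with the stores above fetch the
+; next iteration's rows early, overlapping memory latency with computation)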
+ vld1.u8 {d11, d12, d13}, [r0], r1 + + bne filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! 
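+;(editor note: each output row r below is row[r]*filter[0] +
+; row[r+1]*filter[1]; the last row loaded, q15, is recycled into q11 by the
+; vmov near the stores, so every intermediate row is loaded only once)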
+ + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + vst1.u8 {d4, d5}, [r4], r5 + vst1.u8 {d6, d7}, [r4], r5 + vmov q11, q15 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_sp16x16_loop_neon + + add sp, sp, #272 + + pop {r4-r5,pc} + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r4], r5 ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r4], r5 + vst1.u8 {d18, d19}, [r4], r5 + vst1.u8 {d20, d21}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + pop {r4-r5,pc} + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 
d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r4], r5 + vmov q11, q15 + vst1.u8 {d6, d7}, [r4], r5 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_spo16x16_loop_neon + pop {r4-r5,pc} + + ENDP + +;----------------- + +bifilter16_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..0ac62436f97dd178b1552c5805f754bc1dd03757 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm @@ -0,0 +1,130 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict4x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict4x4_neon| PROC + push {r4, lr} + + adr r12, bifilter4_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x4) + vld1.u8 {d2}, [r0], r1 ;load src data + add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) + + vld1.u8 {d3}, [r0], r1 + vld1.u32 {d31}, [r2] ;first_pass filter + + vld1.u8 {d4}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0-d1) + vld1.u8 {d5}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {d6}, [r0], r1 + + vshr.u64 q4, q1, #8 ;construct src_ptr[1] + vshr.u64 q5, q2, #8 + vshr.u64 d12, d6, #8 + + vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d4, d5 + vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + + vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q8, d10, d1 + vmlal.u8 q9, d12, d1 + + vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d29, q8, #7 + vqrshrn.u16 d30, q9, #7 + +;Second pass: 4x4 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 ;calculate Vfilter location + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d31[4] + + vmull.u8 q1, d28, d0 + vmull.u8 q2, d29, d0 + + vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] + vext.8 d27, d29, d30, #4 + + vmlal.u8 q1, d26, d1 + vmlal.u8 q2, d27, d1 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + + vst1.32 {d2[0]}, [r4] ;store result + vst1.32 {d2[1]}, [r0] + vst1.32 {d3[0]}, [r1] + vst1.32 {d3[1]}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + + vld1.32 {d28[0]}, [r0], r1 ;load src data + vld1.32 {d28[1]}, [r0], r1 + vld1.32 {d29[0]}, [r0], r1 + vld1.32 {d29[1]}, [r0], r1 + vld1.32 {d30[0]}, [r0], r1 + + b secondpass_filter + 
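+;(editor note: when xoffset=0 and yoffset=0 this path falls straight
+; through to skip_secondpass_filter below, reducing the whole predictor to
+; a plain 4x4 copy)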
+;---------------------
+skip_secondpass_filter
+    vst1.32     {d28[0]}, [r4], lr      ;store result
+    vst1.32     {d28[1]}, [r4], lr
+    vst1.32     {d29[0]}, [r4], lr
+    vst1.32     {d29[1]}, [r4], lr
+
+    pop     {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
diff --git a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
new file mode 100644
index 0000000000000000000000000000000000000000..41f5c45ffe7bf429ee28943073fc2d049e937a26
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -0,0 +1,135 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+    push    {r4, lr}
+
+    adr     r12, bifilter8x4_coeff
+    ldr     r4, [sp, #8]            ;load parameters from stack
+    ldr     lr, [sp, #12]           ;load parameters from stack
+
+    cmp     r2, #0                  ;skip first_pass filter if xoffset=0
+    beq     skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+    add     r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8     {q1}, [r0], r1      ;load src data
+    vld1.u32    {d31}, [r2]         ;load first_pass filter
+    vld1.u8     {q2}, [r0], r1
+    vdup.8      d0, d31[0]          ;first_pass filter (d0 d1)
+    vld1.u8     {q3}, [r0], r1
+    vdup.8      d1, d31[4]
+    vld1.u8     {q4}, [r0], r1
+
+    vmull.u8    q6, d2, d0          ;(src_ptr[0] * vp8_filter[0])
+    vld1.u8     {q5}, [r0], r1
+    vmull.u8    q7, d4, d0
+    vmull.u8    q8, d6, d0
+    vmull.u8    q9, d8, d0
+    vmull.u8    q10, d10, d0
+
+    vext.8      d3, d2, d3, #1      ;construct src_ptr[1]
+    vext.8      d5, d4, d5, #1
+    vext.8      d7, d6, d7, #1
+    vext.8      d9, d8, d9, #1
+    vext.8      d11, d10, d11, #1
+
+    vmlal.u8    q6, d3, d1          ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8    q7, d5, d1
+    vmlal.u8    q8, d7, d1
+    vmlal.u8    q9, d9, d1
+    vmlal.u8    q10, d11, d1
+
+    vqrshrn.u16 d22, q6, #7         ;shift/round/saturate to u8
+    vqrshrn.u16 d23, q7, #7
+    vqrshrn.u16 d24, q8, #7
+    vqrshrn.u16 d25, q9, #7
+    vqrshrn.u16 d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+    cmp     r3, #0                  ;skip second_pass filter if yoffset=0
+    beq     skip_secondpass_filter
+
+    add     r3, r12, r3, lsl #3
+    add     r0, r4, lr
+
+    vld1.u32    {d31}, [r3]         ;load second_pass filter
+    add     r1, r0, lr
+
+    vdup.8      d0, d31[0]          ;second_pass filter parameters (d0 d1)
+    vdup.8      d1, d31[4]
+
+    vmull.u8    q1, d22, d0         ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8    q2, d23, d0
+    vmull.u8    q3, d24, d0
+    vmull.u8    q4, d25, d0
+
+    vmlal.u8    q1, d23, d1         ;(src_ptr[pixel_step] * vp8_filter[1])
+    vmlal.u8    q2, d24, d1
+    vmlal.u8    q3, d25, d1
+    vmlal.u8    q4, d26, d1
+
+    add     r2, r1, lr
+
+    vqrshrn.u16 d2, q1, #7          ;shift/round/saturate to u8
+    vqrshrn.u16 d3, q2, #7
+    vqrshrn.u16 d4, q3, #7
+    vqrshrn.u16 d5, q4, #7
+
+    vst1.u8     {d2}, [r4]          ;store result
+    vst1.u8     {d3}, [r0]
+    vst1.u8     {d4}, [r1]
+    vst1.u8     {d5}, [r2]
+
+    pop     {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8     {d22}, [r0], r1     ;load src data
+    vld1.u8     {d23}, [r0], r1
+    vld1.u8     {d24}, [r0], r1
+    vld1.u8     {d25}, [r0], r1
+    vld1.u8     {d26}, [r0], r1
+
+    b       secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8     {d22}, [r4], lr     ;store result
+    vst1.u8     {d23}, [r4], lr
+    vst1.u8     {d24}, [r4], lr
+    vst1.u8     {d25}, [r4], lr
+
+    pop     {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
diff --git a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
new file mode 100644
index 0000000000000000000000000000000000000000..c4711bc4d4ad91a54c54d058603fba2c60dc4e79
--- /dev/null
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -0,0 +1,183 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+    push    {r4, lr}
+
+    adr     r12, bifilter8_coeff
+    ldr     r4, [sp, #8]            ;load parameters from stack
+    ldr     lr, [sp, #12]           ;load parameters from stack
+
+    cmp     r2, #0                  ;skip first_pass filter if xoffset=0
+    beq     skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add     r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8     {q1}, [r0], r1      ;load src data
+    vld1.u32    {d31}, [r2]         ;load first_pass filter
+    vld1.u8     {q2}, [r0], r1
+    vdup.8      d0, d31[0]          ;first_pass filter (d0 d1)
+    vld1.u8     {q3}, [r0], r1
+    vdup.8      d1, d31[4]
+    vld1.u8     {q4}, [r0], r1
+
+    vmull.u8    q6, d2, d0          ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8    q7, d4, d0
+    vmull.u8    q8, d6, d0
+    vmull.u8    q9, d8, d0
+
+    vext.8      d3, d2, d3, #1      ;construct src_ptr[1]
+    vext.8      d5, d4, d5, #1
+    vext.8      d7, d6, d7, #1
+    vext.8      d9, d8, d9, #1
+
+    vmlal.u8    q6, d3, d1          ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8    q7, d5, d1
+    vmlal.u8    q8, d7, d1
+    vmlal.u8    q9, d9, d1
+
+    vld1.u8     {q1}, [r0], r1      ;load src data
+    vqrshrn.u16 d22, q6, #7         ;shift/round/saturate to u8
+    vld1.u8     {q2}, [r0], r1
+    vqrshrn.u16 d23, q7, #7
+    vld1.u8     {q3}, [r0], r1
+    vqrshrn.u16 d24, q8, #7
+    vld1.u8     {q4}, [r0], r1
+    vqrshrn.u16 d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8     {q5}, [r0], r1
+
+    vmull.u8    q6, d2, d0          ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8    q7, d4, d0
+    vmull.u8    q8, d6, d0
+    vmull.u8    q9, d8, d0
+    vmull.u8    q10, d10, d0
+
+    vext.8      d3, d2, d3, #1      ;construct src_ptr[1]
+    vext.8      d5, d4, d5, #1
+    vext.8      d7, d6, d7, #1
+    vext.8      d9, d8, d9, #1
+    vext.8      d11, d10, d11, #1
+
+    vmlal.u8    q6, d3, d1          ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8    q7, d5, d1
+    vmlal.u8    q8, d7, d1
+    vmlal.u8    q9, d9, d1
+    vmlal.u8    q10, d11, d1
+
+    vqrshrn.u16 d26, q6, #7         ;shift/round/saturate to u8
+    vqrshrn.u16 d27, q7, #7
+    vqrshrn.u16 d28, q8, #7
+    vqrshrn.u16 d29, q9, #7
+    vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp     r3, #0                  ;skip second_pass filter if yoffset=0
+    beq     skip_secondpass_filter
+
+    add     r3, r12, r3, lsl #3
+    add     r0, r4, lr
+
+    vld1.u32    {d31}, [r3]         ;load second_pass filter
+    add     r1, r0, lr
+
+    vdup.8      d0, d31[0]          ;second_pass filter parameters (d0 d1)
+    vdup.8      d1, d31[4]
+
+    vmull.u8
q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1], lr + vst1.u8 {d5}, [r1], lr + vst1.u8 {d6}, [r1], lr + vst1.u8 {d7}, [r1], lr + vst1.u8 {d8}, [r1], lr + vst1.u8 {d9}, [r1], lr + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + vst1.u8 {d26}, [r4], lr + vst1.u8 {d27}, [r4], lr + vst1.u8 {d28}, [r4], lr + vst1.u8 {d29}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + +bifilter8_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e3ea91fe6c0493dc8e5c510f34fe7ce77e598b53 --- /dev/null +++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm @@ -0,0 +1,584 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_build_intra_predictors_mby_neon_func| + EXPORT |vp8_build_intra_predictors_mby_s_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_neon_func| PROC + push {r4-r8, lr} + + cmp r3, #0 + beq case_dc_pred + cmp r3, #1 + beq case_v_pred + cmp r3, #2 + beq case_h_pred + cmp r3, #3 + beq case_tm_pred + +case_dc_pred + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up + + cmp r5, #0 + beq skip_dc_pred_left + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + + pop {r4-r8,pc} +case_v_pred + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + pop {r4-r8,pc} + +case_h_pred + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! 
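+    ; (editor note: each ldrb/vdup pair splats one left-column byte across
+    ; a full 16-byte row; the four groups of four ldrb's cover all 16 rows)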
+ + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + pop {r4-r8,pc} + +case_tm_pred + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + subs r12, r12, #1 + bne case_tm_pred_loop + + pop {r4-r8,pc} + + ENDP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_s_neon_func| PROC + push {r4-r8, lr} + + mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; + + cmp r3, #0 + beq case_dc_pred_s + cmp r3, #1 + beq case_v_pred_s + cmp r3, #2 + beq case_h_pred_s + cmp r3, #3 + beq case_tm_pred_s + +case_dc_pred_s + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left_s + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up_s + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up_s + + cmp r5, #0 + beq skip_dc_pred_left_s + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left_s + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left_s + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 
+ vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + + pop {r4-r8,pc} +case_v_pred_s + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + pop {r4-r8,pc} + +case_h_pred_s + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + pop {r4-r8,pc} + +case_tm_pred_s + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop_s + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + subs r12, r12, #1 + bne case_tm_pred_loop_s + + pop {r4-r8,pc} + + ENDP + + + END diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..bda4b965442f80c1c67bf3b1364632f9f2513e44 --- /dev/null +++ b/vp8/common/arm/neon/copymem16x16_neon.asm @@ -0,0 +1,59 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem16x16_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_neon| PROC + + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vld1.u8 {q2}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vld1.u8 {q3}, [r0], r1 + vst1.u8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vst1.u8 {q2}, [r2], r3 + vld1.u8 {q5}, [r0], r1 + vst1.u8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vst1.u8 {q4}, [r2], r3 + vld1.u8 {q7}, [r0], r1 + vst1.u8 {q5}, [r2], r3 + vld1.u8 {q8}, [r0], r1 + vst1.u8 {q6}, [r2], r3 + vld1.u8 {q9}, [r0], r1 + vst1.u8 {q7}, [r2], r3 + vld1.u8 {q10}, [r0], r1 + vst1.u8 {q8}, [r2], r3 + vld1.u8 {q11}, [r0], r1 + vst1.u8 {q9}, [r2], r3 + vld1.u8 {q12}, [r0], r1 + vst1.u8 {q10}, [r2], r3 + vld1.u8 {q13}, [r0], r1 + vst1.u8 {q11}, [r2], r3 + vld1.u8 {q14}, [r0], r1 + vst1.u8 {q12}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + vst1.u8 {q13}, [r2], r3 + vst1.u8 {q14}, [r2], r3 + vst1.u8 {q15}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..35c0f6708a551f76f86ce4c45cea063d66fdec55 --- /dev/null +++ b/vp8/common/arm/neon/copymem8x4_neon.asm @@ -0,0 +1,34 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x4_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_neon| PROC + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vst1.u8 {d3}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..1f5b9411bb5e084230039d549d55c39259fde158 --- /dev/null +++ b/vp8/common/arm/neon/copymem8x8_neon.asm @@ -0,0 +1,43 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_neon| PROC + + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vld1.u8 {d4}, [r0], r1 + vst1.u8 {d3}, [r2], r3 + vld1.u8 {d5}, [r0], r1 + vst1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r0], r1 + vst1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r0], r1 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_neon| + + END diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..79ff02c69401998c686f9f20c42239caf1f1406c --- /dev/null +++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm @@ -0,0 +1,54 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, +; int pred_stride, unsigned char *dst_ptr, +; int dst_stride) + +; r0 input_dc +; r1 pred_ptr +; r2 pred_stride +; r3 dst_ptr +; sp dst_stride + +|vp8_dc_only_idct_add_neon| PROC + add r0, r0, #4 + asr r0, r0, #3 + ldr r12, [sp] + vdup.16 q0, r0 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q1, q0, d2 + vaddw.u8 q2, q0, d4 + + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 + + vst1.32 {d2[0]}, [r3], r12 + vst1.32 {d2[1]}, [r3], r12 + vst1.32 {d4[0]}, [r3], r12 + vst1.32 {d4[1]}, [r3] + + bx lr + + ENDP + + END diff --git a/vp8/common/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..602cce67697381d26d11db57359f7261c397d3c9 --- /dev/null +++ b/vp8/common/arm/neon/dequant_idct_neon.asm @@ -0,0 +1,131 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
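vp8_dc_only_idct_add_neon above exploits the fact that when only the DC coefficient is present, the whole inverse transform collapses to a single rounded constant, (input_dc + 4) >> 3, added to each predictor pixel. A scalar sketch of the same operation:

    /* Scalar sketch of the DC-only IDCT-and-add performed above. */
    static void dc_only_idct_add(short input_dc,
                                 const unsigned char *pred, int pred_stride,
                                 unsigned char *dst, int dst_stride)
    {
        int a1 = (input_dc + 4) >> 3;   /* rounded DC term (add/asr above) */
        int r, c, v;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++) {
                v = pred[c] + a1;       /* vaddw.u8, then vqmovun saturates */
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pred_stride;
            dst += dst_stride;
        }
    }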
+; + + + EXPORT |vp8_dequant_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dequant_idct_add_neon(short *input, short *dq, +; unsigned char *dest, int stride) +; r0 short *input, +; r1 short *dq, +; r2 unsigned char *dest +; r3 int stride + +|vp8_dequant_idct_add_neon| PROC + vld1.16 {q3, q4}, [r0] + vld1.16 {q5, q6}, [r1] + + add r1, r2, r3 ; r1 = dest + stride + lsl r3, #1 ; 2x stride + + vld1.32 {d14[0]}, [r2], r3 + vld1.32 {d14[1]}, [r1], r3 + vld1.32 {d15[0]}, [r2] + vld1.32 {d15[1]}, [r1] + + adr r12, cospi8sqrt2minus1 ; pointer to the first constant + + vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon + vmul.i16 q2, q4, q6 + +;|short_idct4x4llm_neon| PROC + vld1.16 {d0}, [r12] + vswp d3, d4 ;q2(vp[4] vp[12]) + + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 + vqadd.s16 q4, q4, q2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + +; memset(input, 0, 32) -- 32bytes + vmov.i16 q14, #0 + + vswp d3, d4 + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vmov q15, q14 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 + vqadd.s16 q4, q4, q2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vst1.16 {q14, q15}, [r0] + + vrshr.s16 d2, d2, #3 + vrshr.s16 d3, d3, #3 + vrshr.s16 d4, d4, #3 + vrshr.s16 d5, d5, #3 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vaddw.u8 q1, q1, d14 + vaddw.u8 q2, q2, d15 + + sub r2, r2, r3 + sub r1, r1, r3 + + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + + vst1.32 {d0[0]}, [r2], r3 + vst1.32 {d0[1]}, [r1], r3 + vst1.32 {d1[0]}, [r2] + vst1.32 {d1[1]}, [r1] + + bx lr + + ENDP ; |vp8_dequant_idct_add_neon| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x4e7b4e7b +sinpi8sqrt2 DCD 0x8a8c8a8c + + END diff --git a/vp8/common/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..c8e0c31f29c962ca1abeabfe0e72590ac9b858d4 --- /dev/null +++ b/vp8/common/arm/neon/dequantizeb_neon.asm @@ -0,0 +1,34 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
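vp8_dequant_idct_add_neon above fuses three steps: per-coefficient dequantization (input[i] * dq[i]), the 4x4 inverse DCT, and a saturating add into the destination. It also zeroes the 32-byte coefficient block on the way out (the vst1.16 of q14/q15 zeros). The dequantize-and-clear part in scalar form (a sketch; the inverse transform itself is sketched later, after shortidct4x4llm_neon):

    #include <string.h>

    /* Dequantization step fused into vp8_dequant_idct_add_neon (sketch). */
    static void dequant_and_clear(short *input, const short *dq, short *out)
    {
        int i;

        for (i = 0; i < 16; i++)
            out[i] = (short)(input[i] * dq[i]);   /* vmul.i16 above */

        /* the asm stores two q-registers of zeros: memset(input, 0, 32) */
        memset(input, 0, 16 * sizeof(*input));
    }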
+; + + + EXPORT |vp8_dequantize_b_loop_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +|vp8_dequantize_b_loop_neon| PROC + vld1.16 {q0, q1}, [r0] + vld1.16 {q2, q3}, [r1] + + vmul.i16 q4, q0, q2 + vmul.i16 q5, q1, q3 + + vst1.16 {q4, q5}, [r2] + + bx lr + + ENDP + + END diff --git a/vp8/common/arm/neon/idct_blk_neon.c b/vp8/common/arm/neon/idct_blk_neon.c new file mode 100644 index 0000000000000000000000000000000000000000..ee7f223b55b224d916f6b314fa08e5b18e923066 --- /dev/null +++ b/vp8/common/arm/neon/idct_blk_neon.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vpx_rtcd.h" + +/* place these declarations here because we don't want to maintain them + * outside of this scope + */ +void idct_dequant_full_2x_neon(short *q, short *dq, + unsigned char *dst, int stride); +void idct_dequant_0_2x_neon(short *q, short dq, + unsigned char *dst, int stride); + + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dst, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q+32, dq, dst+8, stride); + else + idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride); + } + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + dstu += 4*stride; + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } + + q += 32; + dstv += 4*stride; + + if (((short *)(eobs))[3]) + { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } +} diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..6c29c55860d899a502bcd5aac71c5dfe459bd659 --- /dev/null +++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm @@ -0,0 +1,79 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
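The 0xfefe tests in idct_blk_neon.c above read two adjacent per-block eob (end-of-block) counts as one 16-bit value. An eob of 0 or 1 means at most a DC coefficient survived, so masking bit 0 off each byte answers, with one test, whether either block of the pair needs the full IDCT rather than the DC-only path. A sketch of that predicate, assuming byte-sized eobs and a little-endian target as the NEON builds are:

    /* Sketch of the paired eob test used above: non-zero iff either of two
     * neighbouring 4x4 blocks has eob >= 2 (i.e. non-DC coefficients). */
    static int pair_needs_full_idct(const char *eobs)
    {
        unsigned pair = (unsigned char)eobs[0] |
                        ((unsigned)(unsigned char)eobs[1] << 8);

        /* eob values 0 and 1 only ever set bit 0 of a byte */
        return (pair & 0xfefe) != 0;
    }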
+; + + + EXPORT |idct_dequant_0_2x_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void idct_dequant_0_2x_neon(short *q, short dq, +; unsigned char *dst, int stride); +; r0 *q +; r1 dq +; r2 *dst +; r3 stride +|idct_dequant_0_2x_neon| PROC + push {r4, r5} + + add r12, r2, #4 + vld1.32 {d2[0]}, [r2], r3 + vld1.32 {d8[0]}, [r12], r3 + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d8[1]}, [r12], r3 + vld1.32 {d4[0]}, [r2], r3 + vld1.32 {d10[0]}, [r12], r3 + vld1.32 {d4[1]}, [r2], r3 + vld1.32 {d10[1]}, [r12], r3 + + ldrh r12, [r0] ; lo q + ldrh r4, [r0, #32] ; hi q + mov r5, #0 + strh r5, [r0] + strh r5, [r0, #32] + + sxth r12, r12 ; lo + mul r0, r12, r1 + add r0, r0, #4 + asr r0, r0, #3 + vdup.16 q0, r0 + sxth r4, r4 ; hi + mul r0, r4, r1 + add r0, r0, #4 + asr r0, r0, #3 + vdup.16 q3, r0 + + vaddw.u8 q1, q0, d2 ; lo + vaddw.u8 q2, q0, d4 + vaddw.u8 q4, q3, d8 ; hi + vaddw.u8 q5, q3, d10 + + sub r2, r2, r3, lsl #2 ; dst - 4*stride + add r0, r2, #4 + + vqmovun.s16 d2, q1 ; lo + vqmovun.s16 d4, q2 + vqmovun.s16 d8, q4 ; hi + vqmovun.s16 d10, q5 + + vst1.32 {d2[0]}, [r2], r3 ; lo + vst1.32 {d8[0]}, [r0], r3 ; hi + vst1.32 {d2[1]}, [r2], r3 + vst1.32 {d8[1]}, [r0], r3 + vst1.32 {d4[0]}, [r2], r3 + vst1.32 {d10[0]}, [r0], r3 + vst1.32 {d4[1]}, [r2] + vst1.32 {d10[1]}, [r0] + + pop {r4, r5} + bx lr + + ENDP ; |idct_dequant_0_2x_neon| + END diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..d5dce63f6bd72b54d4a3d964a93dd485fccc06ed --- /dev/null +++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm @@ -0,0 +1,196 @@ +; +; Copyright (c) 2010 The Webm project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |idct_dequant_full_2x_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void idct_dequant_full_2x_neon(short *q, short *dq, +; unsigned char *dst, int stride); +; r0 *q, +; r1 *dq, +; r2 *dst +; r3 stride +|idct_dequant_full_2x_neon| PROC + vld1.16 {q0, q1}, [r1] ; dq (same l/r) + vld1.16 {q2, q3}, [r0] ; l q + add r0, r0, #32 + vld1.16 {q4, q5}, [r0] ; r q + add r12, r2, #4 + + ; interleave the predictors + vld1.32 {d28[0]}, [r2], r3 ; l pre + vld1.32 {d28[1]}, [r12], r3 ; r pre + vld1.32 {d29[0]}, [r2], r3 + vld1.32 {d29[1]}, [r12], r3 + vld1.32 {d30[0]}, [r2], r3 + vld1.32 {d30[1]}, [r12], r3 + vld1.32 {d31[0]}, [r2], r3 + vld1.32 {d31[1]}, [r12] + + adr r1, cospi8sqrt2minus1 ; pointer to the first constant + + ; dequant: q[i] = q[i] * dq[i] + vmul.i16 q2, q2, q0 + vmul.i16 q3, q3, q1 + vmul.i16 q4, q4, q0 + vmul.i16 q5, q5, q1 + + vld1.16 {d0}, [r1] + + ; q2: l0r0 q3: l8r8 + ; q4: l4r4 q5: l12r12 + vswp d5, d8 + vswp d7, d10 + + ; _CONSTANTS_ * 4,12 >> 16 + ; q6: 4 * sinpi : c1/temp1 + ; q7: 12 * sinpi : d1/temp2 + ; q8: 4 * cospi + ; q9: 12 * cospi + vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 + vqdmulh.s16 q7, q5, d0[2] + vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 + vqdmulh.s16 q9, q5, d0[0] + + vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 + vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 + + ; vqdmulh only accepts signed values. 
this was a problem because + ; our constant had the high bit set, and was treated as a negative value. + ; vqdmulh also doubles the value before it shifts by 16. we need to + ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, + ; so we can shift the constant without losing precision. this avoids + ; shift again afterward, but also avoids the sign issue. win win! + ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we + ; pre-shift it + vshr.s16 q8, q8, #1 + vshr.s16 q9, q9, #1 + + ; q4: 4 + 4 * cospi : d1/temp1 + ; q5: 12 + 12 * cospi : c1/temp2 + vqadd.s16 q4, q4, q8 + vqadd.s16 q5, q5, q9 + + ; c1 = temp1 - temp2 + ; d1 = temp1 + temp2 + vqsub.s16 q2, q6, q5 + vqadd.s16 q3, q4, q7 + + ; [0]: a1+d1 + ; [1]: b1+c1 + ; [2]: b1-c1 + ; [3]: a1-d1 + vqadd.s16 q4, q10, q3 + vqadd.s16 q5, q11, q2 + vqsub.s16 q6, q11, q2 + vqsub.s16 q7, q10, q3 + + ; rotate + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + ; idct loop 2 + ; q4: l 0, 4, 8,12 r 0, 4, 8,12 + ; q5: l 1, 5, 9,13 r 1, 5, 9,13 + ; q6: l 2, 6,10,14 r 2, 6,10,14 + ; q7: l 3, 7,11,15 r 3, 7,11,15 + + ; q8: 1 * sinpi : c1/temp1 + ; q9: 3 * sinpi : d1/temp2 + ; q10: 1 * cospi + ; q11: 3 * cospi + vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 + vqdmulh.s16 q9, q7, d0[2] + vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 + vqdmulh.s16 q11, q7, d0[0] + + vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 + vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 + + ; see note on shifting above + vshr.s16 q10, q10, #1 + vshr.s16 q11, q11, #1 + + ; q10: 1 + 1 * cospi : d1/temp1 + ; q11: 3 + 3 * cospi : c1/temp2 + vqadd.s16 q10, q5, q10 + vqadd.s16 q11, q7, q11 + + ; q8: c1 = temp1 - temp2 + ; q9: d1 = temp1 + temp2 + vqsub.s16 q8, q8, q11 + vqadd.s16 q9, q10, q9 + + ; a1+d1 + ; b1+c1 + ; b1-c1 + ; a1-d1 + vqadd.s16 q4, q2, q9 + vqadd.s16 q5, q3, q8 + vqsub.s16 q6, q3, q8 + vqsub.s16 q7, q2, q9 + + ; +4 >> 3 (rounding) + vrshr.s16 q4, q4, #3 ; lo + vrshr.s16 q5, q5, #3 + vrshr.s16 q6, q6, #3 ; hi + vrshr.s16 q7, q7, #3 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; adding pre + ; input is still packed. pre was read interleaved + vaddw.u8 q4, q4, d28 + vaddw.u8 q5, q5, d29 + vaddw.u8 q6, q6, d30 + vaddw.u8 q7, q7, d31 + + vmov.i16 q14, #0 + vmov q15, q14 + vst1.16 {q14, q15}, [r0] ; write over high input + sub r0, r0, #32 + vst1.16 {q14, q15}, [r0] ; write over low input + + sub r2, r2, r3, lsl #2 ; dst - 4*stride + add r1, r2, #4 ; hi + + ;saturate and narrow + vqmovun.s16 d0, q4 ; lo + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 ; hi + vqmovun.s16 d3, q7 + + vst1.32 {d0[0]}, [r2], r3 ; lo + vst1.32 {d0[1]}, [r1], r3 ; hi + vst1.32 {d1[0]}, [r2], r3 + vst1.32 {d1[1]}, [r1], r3 + vst1.32 {d2[0]}, [r2], r3 + vst1.32 {d2[1]}, [r1], r3 + vst1.32 {d3[0]}, [r2] + vst1.32 {d3[1]}, [r1] + + bx lr + + ENDP ; |idct_dequant_full_2x_neon| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x4e7b +; because the lowest bit in 0x8a8c is 0, we can pre-shift this +sinpi8sqrt2 DCD 0x4546 + + END diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e8ea2a61976c8ff0c30a2150f8ec7a81ffea028b --- /dev/null +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -0,0 +1,87 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + EXPORT |vp8_short_inv_walsh4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) +|vp8_short_inv_walsh4x4_neon| PROC + + ; read in all four lines of values: d0->d3 + vld1.i16 {q0-q1}, [r0@128] + + ; first for loop + vadd.s16 d4, d0, d3 ;a = [0] + [12] + vadd.s16 d6, d1, d2 ;b = [4] + [8] + vsub.s16 d5, d0, d3 ;d = [0] - [12] + vsub.s16 d7, d1, d2 ;c = [4] - [8] + + vadd.s16 q0, q2, q3 ; a+b d+c + vsub.s16 q1, q2, q3 ; a-b d-c + + vtrn.32 d0, d2 ;d0: 0 1 8 9 + ;d2: 2 3 10 11 + vtrn.32 d1, d3 ;d1: 4 5 12 13 + ;d3: 6 7 14 15 + + vtrn.16 d0, d1 ;d0: 0 4 8 12 + ;d1: 1 5 9 13 + vtrn.16 d2, d3 ;d2: 2 6 10 14 + ;d3: 3 7 11 15 + + ; second for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [3] + vadd.s16 d6, d1, d2 ;b = [1] + [2] + vsub.s16 d5, d0, d3 ;d = [0] - [3] + vsub.s16 d7, d1, d2 ;c = [1] - [2] + + vmov.i16 q8, #3 + + vadd.s16 q0, q2, q3 ; a+b d+c + vsub.s16 q1, q2, q3 ; a-b d-c + + vadd.i16 q0, q0, q8 ;e/f += 3 + vadd.i16 q1, q1, q8 ;g/h += 3 + + vshr.s16 q0, q0, #3 ;e/f >> 3 + vshr.s16 q1, q1, #3 ;g/h >> 3 + + mov r2, #64 + add r3, r1, #32 + + vst1.i16 d0[0], [r1],r2 + vst1.i16 d1[0], [r3],r2 + vst1.i16 d2[0], [r1],r2 + vst1.i16 d3[0], [r3],r2 + + vst1.i16 d0[1], [r1],r2 + vst1.i16 d1[1], [r3],r2 + vst1.i16 d2[1], [r1],r2 + vst1.i16 d3[1], [r3],r2 + + vst1.i16 d0[2], [r1],r2 + vst1.i16 d1[2], [r3],r2 + vst1.i16 d2[2], [r1],r2 + vst1.i16 d3[2], [r3],r2 + + vst1.i16 d0[3], [r1],r2 + vst1.i16 d1[3], [r3],r2 + vst1.i16 d2[3], [r1] + vst1.i16 d3[3], [r3] + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_neon| + + END diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e44be0a1e34d2199c20401aabc68315f2be2cb35 --- /dev/null +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -0,0 +1,397 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
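vp8_short_inv_walsh4x4_neon above inverts the Walsh-Hadamard transform of the 16 luma DC values and scatters each result into the DC slot of its own 4x4 coefficient block, which is why the stores step 64 bytes (16 shorts) apart with a second pointer offset 32 bytes ahead. A scalar sketch of the same transform and scatter, consistent with those stores:

    /* Scalar sketch of the inverse 4x4 WHT + DC scatter done above. */
    static void inv_walsh4x4(const short *input, short *mb_dqcoeff)
    {
        short output[16];
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        const short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {       /* first pass: down the columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (short)(a1 + b1);
            op[4]  = (short)(c1 + d1);
            op[8]  = (short)(a1 - b1);
            op[12] = (short)(d1 - c1);
            ip++;
            op++;
        }

        op = output;
        for (i = 0; i < 4; i++) {       /* second pass: rows, with +3 >> 3 */
            a1 = op[0] + op[3];
            b1 = op[1] + op[2];
            c1 = op[1] - op[2];
            d1 = op[0] - op[3];

            a2 = a1 + b1;
            b2 = c1 + d1;
            c2 = a1 - b1;
            d2 = d1 - c1;

            op[0] = (short)((a2 + 3) >> 3);
            op[1] = (short)((b2 + 3) >> 3);
            op[2] = (short)((c2 + 3) >> 3);
            op[3] = (short)((d2 + 3) >> 3);
            op += 4;
        }

        for (i = 0; i < 16; i++)
            mb_dqcoeff[i * 16] = output[i];  /* one DC per 4x4 block */
    }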
+; + + + EXPORT |vp8_loop_filter_horizontal_edge_y_neon| + EXPORT |vp8_loop_filter_horizontal_edge_uv_neon| + EXPORT |vp8_loop_filter_vertical_edge_y_neon| + EXPORT |vp8_loop_filter_vertical_edge_uv_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp8_loop_filter_horizontal_edge_y_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 + + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 + + bl vp8_loop_filter_neon + + vst1.u8 {q5}, [r2@128], r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 + + pop {pc} + ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| + + +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v +|vp8_loop_filter_horizontal_edge_uv_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh + ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh + + sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines + sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + vld1.u8 {d21}, [r12@64] ; q3 + + bl vp8_loop_filter_neon + + sub r0, r0, r1, lsl #1 + sub r2, r2, r1, lsl #1 + + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 + + pop {pc} + ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| + +; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, +; const signed char *flimit, +; const signed char *limit, +; const signed char *thresh, +; int count) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + +|vp8_loop_filter_vertical_edge_y_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 + + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 + vld1.u8 {d10}, [r2], r1 + vld1.u8 {d12}, [r12], r1 + vld1.u8 {d14}, [r2], r1 + vld1.u8 {d16}, [r12], r1 + vld1.u8 {d18}, [r2], r1 + vld1.u8 {d20}, [r12], r1 + 
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d11}, [r2], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d15}, [r2], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vdup.u8 q2, r3 ; duplicate thresh + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + bl vp8_loop_filter_neon + + vswp d12, d11 + vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + + vswp d14, d12 + vswp d16, d15 + + add r12, r0, r1, asr #1 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 + + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] + + pop {pc} + ENDP ; |vp8_loop_filter_vertical_edge_y_neon| + +; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch +; const signed char *flimit, +; const signed char *limit, +; const signed char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v +|vp8_loop_filter_vertical_edge_uv_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns + ldr r2, [sp, #8] ; load v ptr + vdup.u8 q1, r3 ; duplicate limit + sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data + vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 + vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 + vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 + vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 + vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 + vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 + vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] + vld1.u8 {d21}, [r3] + + ldr r12, [sp, #4] ; load thresh + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vdup.u8 q2, r12 ; duplicate thresh + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + bl vp8_loop_filter_neon + + vswp d12, d11 + vswp d16, d13 + vswp d14, d12 + vswp d16, d15 + + sub r0, r0, #2 + sub r2, r2, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, 
[r0], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+; void vp8_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do
+; the necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0    flimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
+|vp8_loop_filter_neon| PROC
+
+    ; vp8_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q3, q3, q4
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3
+
+    vmov.u8     q10, #0x80                  ; 0x80
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    vcge.u8     q15, q1, q15
+
+    ; vp8_filter() function
+    ; convert to signed
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
+    vmovl.u8    q4, d20
+
+    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp8_filter &= hev
+    vand        q15, q15, q9                ; vp8_filter_mask
+
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4                      ; #4
+
+    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+
+    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
+
+    ; outer tap adjustments: ++vp8_filter >> 1
+    vrshr.s8    q1, q1, #1
+    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
+    vmov.u8     q0, #0x80                   ; 0x80
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_neon|
+
+;-----------------
+
+    END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
new file mode 100644
index 0000000000000000000000000000000000000000..adf848b9c347966ecd5205b8f9a8f0a4cd46f9c2
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp8_loop_filter_bhs_neon| + EXPORT |vp8_loop_filter_mbhs_neon| + ARM + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE + +|vp8_loop_filter_simple_horizontal_edge_neon| PROC + + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p0 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p1 + + vabd.u8 q15, q6, q7 ; abs(p0 - q0) + vabd.u8 q14, q5, q8 ; abs(p1 - q1) + + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q3, d15, d13 + + vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + + vmov.u8 q10, #0x03 ; 0x03 + vmov.u8 q9, #0x04 ; 0x04 + + vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q3, q3, d9 + + vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d9, q3 + + vand q14, q4, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 + + sub r0, r0, r1 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 + + bx lr + ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + + END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm 
b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e690df2f7de9d8e3e9cd502f78c24fd70c5c6241 --- /dev/null +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -0,0 +1,154 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| + ARM + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE + +|vp8_loop_filter_simple_vertical_edge_neon| PROC + sub r0, r0, #2 ; move src pointer down by 2 columns + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] + + vswp d7, d10 + vswp d12, d9 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + sub r0, r0, r1, lsl #4 + vabd.u8 q15, q5, q4 ; abs(p0 - q0) + vabd.u8 q14, q3, q6 ; abs(p1 - q1) + + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value + veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value + veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value + veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value + + vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) + vsubl.s8 q13, d9, d11 + + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 + + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 + + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 + + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 + + add r0, r0, #1 + add r3, r0, r1 + + vand q14, q14, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) + + veor q6, q11, q0 
; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
+    vswp        d13, d14
+
+    ;store op1, op0, oq0, oq1
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                      ; duplicate blim
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+    add         r0, r4, #8
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                    ; load mblim from mem
+    vdup.s8     q1, r3                      ; duplicate mblim
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_mbvs_neon|
+    END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
new file mode 100644
index 0000000000000000000000000000000000000000..f41c156df8b27783c36ef81ba0f1cada5f666e2c
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -0,0 +1,469 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
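All four simple-filter entry points above funnel into the same kernel: a flatness test against blimit, then a two-tap adjustment of p0/q0 in saturating signed arithmetic (the veor with 0x80 converts between unsigned pixels and signed working values). The equivalent per-pixel scalar computation (a sketch of the kernel, not code from this patch; it relies on arithmetic right shift of negative values, as vshr.s8 does):

    #include <stdlib.h>

    /* vqadd/vqsub-style saturation to [-128, 127] */
    static int sclamp(int v)
    {
        return v < -128 ? -128 : (v > 127 ? 127 : v);
    }

    /* Scalar sketch of the simple loop-filter kernel above, for one pixel
     * position across the edge (p1, p0 | q0, q1). */
    static void simple_filter(unsigned char *op1, unsigned char *op0,
                              unsigned char *oq0, unsigned char *oq1,
                              unsigned char blimit)
    {
        /* flatness mask: abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit */
        int mask = (abs(*op0 - *oq0) * 2 + abs(*op1 - *oq1) / 2 <= blimit)
                       ? -1 : 0;

        /* veor with 0x80 == convert a u8 pixel to a signed working value */
        int p1 = *op1 - 0x80, p0 = *op0 - 0x80;
        int q0 = *oq0 - 0x80, q1 = *oq1 - 0x80;
        int filter, Filter1, Filter2;

        filter = sclamp(p1 - q1);
        filter = sclamp(filter + 3 * (q0 - p0)) & mask;

        Filter1 = sclamp(filter + 4) >> 3;
        Filter2 = sclamp(filter + 3) >> 3;

        *oq0 = (unsigned char)(sclamp(q0 - Filter1) + 0x80);
        *op0 = (unsigned char)(sclamp(p0 + Filter2) + 0x80);
    }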
+; + + + EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| + EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| + EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| + EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) +; r0 unsigned char *src, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp8_mbloop_filter_horizontal_edge_y_neon| PROC + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line + + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 {q10}, [r12@128], r1 ; q3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 + + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 + + pop {pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| + +; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v + +|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 + + bl vp8_mbloop_filter_neon + + sub r0, r0, r1, lsl #3 + sub r12, r12, r1, lsl #3 + + add r0, r0, r1 + add r12, r12, r1 + + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v oq2 + + pop {pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| + +; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char 
*thresh) +; r0 unsigned char *src, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp8_mbloop_filter_vertical_edge_y_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r12], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub r0, r0, r1, lsl #3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 + + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r12], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| + +; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 const signed char *flimit, +; r3 const signed char *limit, +; sp const signed char *thresh, +; sp+4 unsigned char *v +|vp8_mbloop_filter_vertical_edge_uv_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r12], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r12], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub r0, r0, r1, lsl #3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 + + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, 
op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r12], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| + +; void vp8_mbloop_filter_neon() +; This is a helper function for the macroblock loopfilters. The individual +; functions do the necessary load, transpose (if necessary), preserve (if +; necessary) and store. + +; r0,r1 PRESERVE +; r2 mblimit +; r3 limit + +; q2 thresh +; q3 p3 PRESERVE +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 PRESERVE + +|vp8_mbloop_filter_neon| PROC + + ; vp8_filter_mask + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vmax.u8 q11, q11, q12 + vmax.u8 q12, q13, q14 + vmax.u8 q1, q1, q0 + vmax.u8 q15, q11, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + ; vp8_hevmask + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 + vmax.u8 q15, q15, q1 + + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit + + vmov.u8 q0, #0x80 ; 0x80 + + vcge.u8 q15, q1, q15 + + vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 + vmov.u16 q11, #3 ; #3 + + ; vp8_filter + ; convert to signed + veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 + veor q6, q6, q0 ; ps0 + veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + + veor q8, q8, q0 ; qs1 + veor q4, q4, q0 ; ps2 + veor q9, q9, q0 ; qs2 + + vorr q14, q13, q14 ; vp8_hevmask + + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + + vsubl.s8 q2, d14, d12 ; qs0 - ps0 + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) + + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + + vand q15, q15, q12 ; vp8_filter_mask + + vmul.i16 q13, q13, q11 + + vmov.u8 q12, #3 ; #3 + + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vmov.u8 q11, #4 ; #4 + + ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q13 + + vand q1, q1, q15 ; vp8_filter &= mask + + vmov.u16 q15, #63 ; #63 + + vand q13, q1, q14 ; Filter2 &= hev + + vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) + + vmov q0, q15 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q11, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) + + vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) + + vbic q1, q1, q14 ; vp8_filter &= ~hev + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 + vmlal.s8 q15, d3, d5 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + 
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) + vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) + + veor q9, q11, q1 ; *oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op2 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 + + bx lr + ENDP ; |vp8_mbloop_filter_neon| + +;----------------- + + END diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..d7c590e15a21fa33a70b78b684e1252fd447c937 --- /dev/null +++ b/vp8/common/arm/neon/sad16_neon.asm @@ -0,0 +1,207 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sad16x16_neon| + EXPORT |vp8_sad16x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_sad16x16_neon| PROC +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0] + vld1.8 {q7}, [r2] + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================== +;unsigned int vp8_sad16x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +|vp8_sad16x8_neon| PROC + vld1.8 {q0}, [r0], r1 + vld1.8 
{q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..23ba6df93a4dd8856e96396b0372cb646329d264 --- /dev/null +++ b/vp8/common/arm/neon/sad8_neon.asm @@ -0,0 +1,209 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sad8x8_neon| + EXPORT |vp8_sad8x16_neon| + EXPORT |vp8_sad4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; unsigned int vp8_sad8x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x8_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +;unsigned int vp8_sad8x16_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x16_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, 
[r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;=========================== +;unsigned int vp8_sad4x4_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad4x4_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 d1, d24 + vpaddl.u32 d0, d1 + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/common/arm/neon/save_reg_neon.asm b/vp8/common/arm/neon/save_reg_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..fd7002e7a9ec64a73e0bad3cb00fc30763b04f3f --- /dev/null +++ b/vp8/common/arm/neon/save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_push_neon| + EXPORT |vp8_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp8_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp8_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..67d2ab0150ddeea8ba17318a4d6e35bcad412b1b --- /dev/null +++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm @@ -0,0 +1,139 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
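The vp8_sad*_neon kernels above all follow one pattern: widening absolute-difference instructions (vabdl to start, vabal to accumulate) walk down the block, then vpaddl reductions fold the 16-bit partial sums into a single 32-bit total in d0[0]. The scalar definition they accelerate (a sketch, generalized over block size):

    #include <stdlib.h>

    /* Scalar definition of the sum of absolute differences computed by the
     * vp8_sadWxH_neon kernels above (sketch). */
    static unsigned int sad(const unsigned char *src, int src_stride,
                            const unsigned char *ref, int ref_stride,
                            int w, int h)
    {
        unsigned int total = 0;
        int r, c;

        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++)
                total += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return total;
    }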
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+;                            unsigned char *dst, int stride)
+;r0 short *input
+;r1 unsigned char *pred
+;r2 int pitch
+;r3 unsigned char *dst
+;sp int stride
+;*************************************************************
+
+; static const int cospi8sqrt2minus1=20091;
+; static const int sinpi8sqrt2 =35468;
+; static const int rounding = 0;
+
+; Optimization note: the data produced by dequantization are signed
+; 13-bit values in the range [-4096, 4095]. This lets us use the NEON
+; "vqdmulh" instruction, since the product cannot go out of range
+; (13+16+1 = 30 bits < 32 bits). The instruction returns the high half
+; of the multiplication, which is exactly what the IDCT needs.
+
+|vp8_short_idct4x4llm_neon| PROC
+    adr     r12, idct_coeff
+    vld1.16 {q1, q2}, [r0]
+    vld1.16 {d0}, [r12]
+
+    vswp    d3, d4                  ;q2(vp[4] vp[12])
+    ldr     r0, [sp]                ; stride
+
+    vqdmulh.s16 q3, q2, d0[2]
+    vqdmulh.s16 q4, q2, d0[0]
+
+    vqadd.s16 d12, d2, d3           ;a1
+    vqsub.s16 d13, d2, d3           ;b1
+
+    vshr.s16 q3, q3, #1
+    vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2            ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16 q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16 d10, d6, d9           ;c1
+    vqadd.s16 d11, d7, d8           ;d1
+
+    vqadd.s16 d2, d12, d11
+    vqadd.s16 d3, d13, d10
+    vqsub.s16 d4, d13, d10
+    vqsub.s16 d5, d12, d11
+
+    vtrn.32 d2, d4
+    vtrn.32 d3, d5
+    vtrn.16 d2, d3
+    vtrn.16 d4, d5
+
+    vswp d3, d4
+
+    vqdmulh.s16 q3, q2, d0[2]
+    vqdmulh.s16 q4, q2, d0[0]
+
+    vqadd.s16 d12, d2, d3           ;a1
+    vqsub.s16 d13, d2, d3           ;b1
+
+    vshr.s16 q3, q3, #1
+    vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2            ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16 q4, q4, q2
+
+    vqsub.s16 d10, d6, d9           ;c1
+    vqadd.s16 d11, d7, d8           ;d1
+
+    vqadd.s16 d2, d12, d11
+    vqadd.s16 d3, d13, d10
+    vqsub.s16 d4, d13, d10
+    vqsub.s16 d5, d12, d11
+
+    vrshr.s16 d2, d2, #3
+    vrshr.s16 d3, d3, #3
+    vrshr.s16 d4, d4, #3
+    vrshr.s16 d5, d5, #3
+
+    vtrn.32 d2, d4
+    vtrn.32 d3, d5
+    vtrn.16 d2, d3
+    vtrn.16 d4, d5
+
+    ; load prediction data
+    vld1.32 d6[0], [r1], r2
+    vld1.32 d6[1], [r1], r2
+    vld1.32 d7[0], [r1], r2
+    vld1.32 d7[1], [r1], r2
+
+    ; add prediction and residual
+    vaddw.u8 q1, q1, d6
+    vaddw.u8 q2, q2, d7
+
+    vqmovun.s16 d1, q1
+    vqmovun.s16 d2, q2
+
+    ; store to destination
+    vst1.32 d1[0], [r3], r0
+    vst1.32 d1[1], [r3], r0
+    vst1.32 d2[0], [r3], r0
+    vst1.32 d2[1], [r3], r0
+
+    bx lr
+
+    ENDP
+
+;-----------------
+
+idct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 0000000000000000000000000000000000000000..9fdafd3609ed3d1d39a60b48536bbad4fd0fb83c
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,490 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,  93,    50,  -6,  0,   0,  0
+    DCD     3, -16, 77,    77, -16,  3,   0,  0
+    DCD     0, -6,  50,    93,  -9,  0,   0,  0
+    DCD     1, -8,  36,   108, -11,  2,   0,  0
+    DCD     0, -1,  12,   123,  -6,  0,   0,  0
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
+; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
+; After multiplication the result can be negative, so it is treated as s16. But the
+; result can also be a large positive number (> 2^15-1), which would be misread as a
+; negative s16. To avoid that error, the filter coeffs are applied in the order 0, 1,
+; 4, 5, 2, which keeps the running sum within s16 range; the 3rd coeff is then added
+; with a saturating add. The same applies to the other filter functions.
+
+|vp8_sixtap_predict16x16_neon| PROC
+    push    {r4-r5, lr}
+
+    adr     r12, filter16_coeff
+    ldr     r4, [sp, #12]           ;load parameters from stack
+    ldr     r5, [sp, #16]           ;load parameters from stack
+
+    cmp     r2, #0                  ;skip first_pass filter if xoffset=0
+    beq     secondpass_filter16x16_only
+
+    add     r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp     r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32 {q14, q15}, [r2]       ;load first_pass filter
+
+    beq     firstpass_filter16x16_only
+
+    sub     sp, sp, #336            ;reserve space on stack for temporary storage
+    mov     lr, sp
+
+    vabs.s32 q12, q14
+    vabs.s32 q13, q15
+
+    mov     r2, #7                  ;loop counter
+    sub     r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    sub     r0, r0, r1, lsl #1
+
+    vdup.8  d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8  d1, d24[4]
+    vdup.8  d2, d25[0]
+    vdup.8  d3, d25[4]
+    vdup.8  d4, d26[0]
+    vdup.8  d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+    vld1.u8 {d6, d7, d8}, [r0], r1      ;load src data
+    vld1.u8 {d9, d10, d11}, [r0], r1
+    vld1.u8 {d12, d13, d14}, [r0], r1
+
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    vmull.u8 q8, d6, d0             ;(src_ptr[-2] * vp8_filter[0])
+    vmull.u8 q9, d7, d0
+    vmull.u8 q10, d9, d0
+    vmull.u8 q11, d10, d0
+    vmull.u8 q12, d12, d0
+    vmull.u8 q13, d13, d0
+
+    vext.8  d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8  d29, d9, d10, #1
+    vext.8  d30, d12, d13, #1
+
+    vmlsl.u8 q8, d28, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8 q10, d29, d1
+    vmlsl.u8 q12, d30, d1
+
+    vext.8  d28, d7, d8, #1
+    vext.8  d29, d10, d11, #1
+    vext.8  d30, d13, d14, #1
+
+    vmlsl.u8 q9, d28, d1            ;-(src_ptr[-1] * vp8_filter[1])
+    vmlsl.u8 q11, d29, d1
+    vmlsl.u8 q13, d30, d1
+
+    vext.8  d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8  d29, d9, d10, #4
+    vext.8  d30, d12, d13, #4
+
+    vmlsl.u8 q8, d28, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8 q10, d29, d4
+    vmlsl.u8 q12, d30, d4
+
+    vext.8  d28, d7, d8, #4
+    vext.8  d29, d10, d11, #4
+    vext.8  d30, d13, d14, #4
+
+    vmlsl.u8 q9, d28, d4            ;-(src_ptr[2] * vp8_filter[4])
+    vmlsl.u8 q11, d29, d4
+    vmlsl.u8 q13, d30, d4
+
+    vext.8  d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8  d29, d9, d10, #5
+    vext.8  d30, d12, d13, #5
+
+    vmlal.u8 q8, d28, d5            ;(src_ptr[3] * vp8_filter[5])
+    vmlal.u8 q10, d29, d5
+    vmlal.u8 q12, d30, d5
+
+    vext.8  d28, d7, d8, #5
+    vext.8  d29, d10, d11, #5
+    vext.8  d30, d13, d14, #5
+
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q11, d29, d5 + vmlal.u8 q13, d30, d5 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d9, d10, #2 + vext.8 d30, d12, d13, #2 + + vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q10, d29, d2 + vmlal.u8 q12, d30, d2 + + vext.8 d28, d7, d8, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d13, d14, #2 + + vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q11, d29, d2 + vmlal.u8 q13, d30, d2 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d9, d10, #3 + vext.8 d30, d12, d13, #3 + + vext.8 d15, d7, d8, #3 + vext.8 d31, d10, d11, #3 + vext.8 d6, d13, d14, #3 + + vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + + vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) + vqadd.s16 q10, q5 + vqadd.s16 q12, q6 + + vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q7, d31, d3 + vmull.u8 q3, d6, d3 + + subs r2, r2, #1 + + vqadd.s16 q9, q6 + vqadd.s16 q11, q7 + vqadd.s16 q13, q3 + + vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q9, #7 + vqrshrun.s16 d8, q10, #7 + vqrshrun.s16 d9, q11, #7 + vqrshrun.s16 d10, q12, #7 + vqrshrun.s16 d11, q13, #7 + + vst1.u8 {d6, d7, d8}, [lr]! ;store result + vst1.u8 {d9, d10, d11}, [lr]! + + bne filt_blk2d_fp16x16_loop_neon + +;Second pass: 16x16 +;secondpass_filter - do first 8-columns and then second 8-columns + add r3, r12, r3, lsl #5 + sub lr, lr, #336 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + mov r2, #16 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp16x16_outloop_neon + vld1.u8 {d18}, [lr], r2 ;load src data + vld1.u8 {d19}, [lr], r2 + vld1.u8 {d20}, [lr], r2 + vld1.u8 {d21}, [lr], r2 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [lr], r2 + +secondpass_inner_loop_neon + vld1.u8 {d23}, [lr], r2 ;load src data + vld1.u8 {d24}, [lr], r2 + vld1.u8 {d25}, [lr], r2 + vld1.u8 {d26}, [lr], r2 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_inner_loop_neon + + subs r3, r3, #1 + sub lr, lr, #336 + add lr, lr, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_sp16x16_outloop_neon + + add sp, sp, #336 + pop {r4-r5,pc} + 
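+; The path just completed is the general two-pass case. A rough C model of it
+; follows, for illustration only (clamp_u8, hf and vf are our names, not part
+; of the library): the horizontal pass filters 21 rows (16 output rows plus 5
+; context rows for the 6-tap vertical filter) into a 21x16 scratch buffer, and
+; the vertical pass filters columns of that buffer into the destination. Note
+; the NEON code saturates its intermediate sums at s16 (see the ordering note
+; above), while this model just accumulates in int.
+;
+; static unsigned char clamp_u8(int v)
+; {
+;     return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+; }
+;
+; static void sixtap_two_pass_16x16(const unsigned char *src, int src_stride,
+;                                   unsigned char *dst, int dst_stride,
+;                                   const int hf[6], const int vf[6])
+; {
+;     unsigned char tmp[21 * 16];
+;     int r, c, k, sum;
+;     src -= 2 * src_stride + 2;              /* start at (line-2, column-2) */
+;     for (r = 0; r < 21; r++)
+;         for (c = 0; c < 16; c++) {
+;             for (sum = 0, k = 0; k < 6; k++)
+;                 sum += src[r * src_stride + c + k] * hf[k];
+;             tmp[r * 16 + c] = clamp_u8((sum + 64) >> 7);    /* vqrshrun #7 */
+;         }
+;     for (r = 0; r < 16; r++)
+;         for (c = 0; c < 16; c++) {
+;             for (sum = 0, k = 0; k < 6; k++)
+;                 sum += tmp[(r + k) * 16 + c] * vf[k];
+;             dst[r * dst_stride + c] = clamp_u8((sum + 64) >> 7);
+;         }
+; }
+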
+;-------------------- +firstpass_filter16x16_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #8 ;loop counter + sub r0, r0, #2 ;move srcptr back to (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + + pld [r0] + pld [r0, r1] + + vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q7, d7, d0 + vmull.u8 q8, d9, d0 + vmull.u8 q9, d10, d0 + + vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d21, d9, d10, #1 + vext.8 d22, d7, d8, #1 + vext.8 d23, d10, d11, #1 + vext.8 d24, d6, d7, #4 ;construct src_ptr[2] + vext.8 d25, d9, d10, #4 + vext.8 d26, d7, d8, #4 + vext.8 d27, d10, d11, #4 + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + + vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d21, d1 + vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d23, d1 + vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d25, d4 + vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d27, d4 + vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + + vext.8 d20, d7, d8, #5 + vext.8 d21, d10, d11, #5 + vext.8 d22, d6, d7, #2 ;construct src_ptr[0] + vext.8 d23, d9, d10, #2 + vext.8 d24, d7, d8, #2 + vext.8 d25, d10, d11, #2 + + vext.8 d26, d6, d7, #3 ;construct src_ptr[1] + vext.8 d27, d9, d10, #3 + vext.8 d28, d7, d8, #3 + vext.8 d29, d10, d11, #3 + + vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d21, d5 + vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d23, d2 + vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d25, d2 + + vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q11, d27, d3 + vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q15, d29, d3 + + vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q11 + vqadd.s16 q7, q12 + vqadd.s16 q9, q15 + + subs r2, r2, #1 + + vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q7, #7 + vqrshrun.s16 d8, q8, #7 + vqrshrun.s16 d9, q9, #7 + + vst1.u8 {q3}, [r4], r5 ;store result + vst1.u8 {q4}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + + pop {r4-r5,pc} + +;-------------------- +secondpass_filter16x16_only +;Second pass: 16x16 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_spo16x16_outloop_neon + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [r0], r1 + +secondpass_only_inner_loop_neon + vld1.u8 {d23}, [r0], r1 ;load src data + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 
+ vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_only_inner_loop_neon + + subs r3, r3, #1 + sub r0, r0, r1, lsl #4 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1 + add r0, r0, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_spo16x16_outloop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + END diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..a4222bc62c54d750b1cfbe2ec2505962adad3f5f --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sixtap_predict4x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter4_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_sixtap_predict4x4_neon| PROC + push {r4, lr} + + adr r12, filter4_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter4x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter4x4_only + + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + sub r0, r0, r1, lsl #1 ;go back 2 lines of src data + +;First pass: output_height lines x output_width columns (9x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data + vld1.u8 {q4}, [r0], r1 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + ;First Pass on rest 5-line data + vld1.u8 {q11}, [r0], r1 + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, 
#5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vext.8 d31, d22, d23, #5 ;construct src_ptr[3] + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5]) + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] + + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vext.8 d31, d22, d23, #4 ;construct src_ptr[2] + + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vext.8 d31, d22, d23, #2 ;construct src_ptr[0] + + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vext.8 d31, d22, d23, #3 ;construct src_ptr[1] + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3]) + + add r3, r12, r3, lsl #5 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + vqadd.s16 q12, q11 + + vext.8 d23, d27, d28, #4 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d30, q8, #7 + vqrshrun.s16 d31, q12, #7 + +;Second pass: 4x4 + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +firstpass_filter4x4_only + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, 
q15 + + sub r0, r0, #2 ;go back 2 columns of src data + +;First pass: output_height lines x output_width columns (4x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + vst1.32 {d27[0]}, [r4] ;store result + vst1.32 {d27[1]}, [r0] + vst1.32 {d28[0]}, [r1] + vst1.32 {d28[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +secondpass_filter4x4_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.32 {d27[0]}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.32 {d27[1]}, [r0], r1 + vabs.s32 q7, q5 + vld1.32 {d28[0]}, [r0], r1 + vabs.s32 q8, q6 + vld1.32 {d28[1]}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.32 {d29[0]}, [r0], r1 + vdup.8 d1, d14[4] + vld1.32 {d29[1]}, [r0], r1 + vdup.8 d2, d15[0] + vld1.32 {d30[0]}, [r0], r1 + vdup.8 d3, d15[4] + vld1.32 {d30[1]}, [r0], r1 + vdup.8 d4, d16[0] + vld1.32 {d31[0]}, [r0], r1 + vdup.8 d5, d16[4] + + vext.8 d23, d27, d28, #4 + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of 
all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + ENDP + +;----------------- + + END diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..a57ec015f2c0c7404732cbeded169dd911b79a88 --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm @@ -0,0 +1,473 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x4_neon| PROC + push {r4-r5, lr} + + adr r12, filter8_coeff + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x4_only + + sub sp, sp, #32 ;reserve space on stack for temporary storage + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + mov lr, sp + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (9x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + 
vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q7}, [r0], r1 + vst1.u8 {d25}, [lr]! + + ;first_pass filtering on the rest 5-line data + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d27, q9, #7 + vqrshrun.s16 d28, q10, #7 + vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack + vqrshrun.s16 d30, q12, #7 + +;Second pass: 8x4 +;secondpass_filter + add r3, r12, r3, lsl #5 + sub lr, lr, #32 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {q12}, [lr]! 
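+
+    ; Note: the 9 intermediate rows of the 8x4 first pass (4 output rows + 5
+    ; context rows) are split between the 32-byte scratch buffer (rows 0-3,
+    ; just reloaded into q11/q12) and registers d26-d30 (rows 4-8), so the
+    ; second pass needs no further loads.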
+ + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + add sp, sp, #32 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter8x4_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + vld1.u8 {q3}, [r0], r1 ;load src data + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (4x8) + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 
{d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x4_only +;Second pass: 8x4 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {d22}, [r0], r1 + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d25}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d28}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d29}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d30}, [r0], r1 + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + pop {r4-r5,pc} + + ENDP + +;----------------- + + END diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..00ed5aeefe3c9cb4458cf15f8322eda756f4792f --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm @@ -0,0 +1,524 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sixtap_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x8_neon| PROC + push {r4-r5, lr} + + adr r12, filter8_coeff + + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x8_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x8_only + + sub sp, sp, #64 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (13x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + +filt_blk2d_fp8x8_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + subs r2, r2, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d24}, [lr]! 
+ vld1.u8 {q6}, [r0], r1 + vst1.u8 {d25}, [lr]! + + bne filt_blk2d_fp8x8_loop_neon + + ;first_pass filtering on the rest 5-line data + ;vld1.u8 {q3}, [r0], r1 ;load src data + ;vld1.u8 {q4}, [r0], r1 + ;vld1.u8 {q5}, [r0], r1 + ;vld1.u8 {q6}, [r0], r1 + vld1.u8 {q7}, [r0], r1 + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + add r3, r12, r3, lsl #5 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + sub lr, lr, #64 + vqrshrun.s16 d27, q9, #7 + vld1.u8 {q9}, [lr]! ;load intermediate data from stack + vqrshrun.s16 d28, q10, #7 + vld1.u8 {q10}, [lr]! + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q11, #7 + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vqrshrun.s16 d30, q12, #7 + vld1.u8 {q12}, [lr]! 
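+
+; At this point the 13 intermediate rows (8 output rows + 5 context rows) sit
+; in d18-d30: rows 0-7 reloaded from the scratch buffer, rows 8-12 fresh from
+; the first pass. A hedged C sketch of the vertical pass below, illustrative
+; only ("mid" is our name for the 13x8 intermediate block, vf for the six
+; vertical filter taps):
+;
+; int r, c;
+; for (r = 0; r < 8; r++)
+;     for (c = 0; c < 8; c++) {
+;         int sum = 0, k, v;
+;         for (k = 0; k < 6; k++)
+;             sum += mid[(r + k) * 8 + c] * vf[k];
+;         v = (sum + 64) >> 7;                            /* vqrshrun #7 */
+;         dst[r * dst_stride + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+;     }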
+ +;Second pass: 8x8 + mov r3, #2 ;loop counter + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_sp8x8_loop_neon + + add sp, sp, #64 + pop {r4-r5,pc} + +;--------------------- +firstpass_filter8x8_only + ;add r2, r12, r2, lsl #5 ;calculate filter location + ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (8x8) +filt_blk2d_fpo8x8_loop_neon + vld1.u8 {q3}, [r0], r1 ;load src data + vld1.u8 {q4}, [r0], r1 + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + 
vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + ; + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + subs r2, r2, #1 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + bne filt_blk2d_fpo8x8_loop_neon + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x8_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {d19}, [r0], r1 + vabs.s32 q7, q5 + vld1.u8 {d20}, [r0], r1 + vabs.s32 q8, q6 + vld1.u8 {d21}, [r0], r1 + mov r3, #2 ;loop counter + vld1.u8 {d22}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d23}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d24}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d25}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + +;Second pass: 8x8 +filt_blk2d_spo8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_spo8x8_loop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + + END diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e3b48327d3f5d2a9eb85d35ddcc20ea07ef01d22 --- /dev/null +++ b/vp8/common/arm/neon/variance_neon.asm @@ -0,0 +1,276 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_variance16x16_neon| + EXPORT |vp8_variance16x8_neon| + EXPORT |vp8_variance8x16_neon| + EXPORT |vp8_variance8x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance16x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + ;VPADAL adds adjacent pairs of elements of a vector, and accumulates + ;the results into the elements of the destination vector. The explanation + ;in ARM guide is wrong. + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + ;vmov.32 r0, d0[0] ;this instruction costs a lot + ;vmov.32 r1, d1[0] + ;mul r0, r0, r0 + ;str r1, [r12] + ;sub r0, r1, r0, lsr #8 + + ; while sum is signed, sum * sum is always positive and must be treated as + ; unsigned to avoid propagating the sign bit. + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================ +;unsigned int vp8_variance16x8_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) +|vp8_variance16x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #4 + +variance16x8_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.u32 d10, d10, #7 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================= +;unsigned int vp8_variance8x16_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) + +|vp8_variance8x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance8x16_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up 
source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d2, d6 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + + bne variance8x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.u32 d10, d10, #7 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================== +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance8x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +variance8x8_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.u32 d10, d10, #6 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + + END diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..e7a3ed173f3d158b6eefe75c0c5f281675ca4ef6 --- /dev/null +++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -0,0 +1,423 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + +;----------------- + + EXPORT |vp8_sub_pixel_variance16x16_neon_func| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. 
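+;
+; A rough C model of what this function computes, for illustration only (the
+; helper name and locals are ours, and the final variance step is inferred
+; from vp8_variance16x16_neon, whose code this reuses): bilinearly interpolate
+; the 16x16 source block at the given sub-pel offset, then return its variance
+; against the reference, i.e. sse - sum^2/256. Each bilinear_taps_coeff pair
+; is {128 - 16*off, 16*off}; when an offset is 0 the code above skips that
+; pass, while the model lets the pair degenerate to {128, 0}.
+;
+; static unsigned int subpel_variance16x16_model(
+;     const unsigned char *src, int src_stride, int xoff, int yoff,
+;     const unsigned char *ref, int ref_stride, unsigned int *sse)
+; {
+;     unsigned char tmp[17 * 16], pred[16 * 16];
+;     int hf0 = 128 - 16 * xoff, hf1 = 16 * xoff;
+;     int vf0 = 128 - 16 * yoff, vf1 = 16 * yoff;
+;     int r, c, sum = 0;
+;     unsigned int ssq = 0;
+;     for (r = 0; r < 17; r++)              /* horizontal pass: 16+1 rows */
+;         for (c = 0; c < 16; c++)
+;             tmp[r * 16 + c] = (unsigned char)((src[r * src_stride + c] * hf0 +
+;                                src[r * src_stride + c + 1] * hf1 + 64) >> 7);
+;     for (r = 0; r < 16; r++)              /* vertical pass */
+;         for (c = 0; c < 16; c++)
+;             pred[r * 16 + c] = (unsigned char)((tmp[r * 16 + c] * vf0 +
+;                                 tmp[(r + 1) * 16 + c] * vf1 + 64) >> 7);
+;     for (r = 0; r < 16; r++)
+;         for (c = 0; c < 16; c++) {
+;             int d = pred[r * 16 + c] - ref[r * ref_stride + c];
+;             sum += d;
+;             ssq += (unsigned int)(d * d);
+;         }
+;     *sse = ssq;
+;     return ssq - (unsigned int)(((long long)sum * sum) >> 8);
+; }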
+ +|vp8_sub_pixel_variance16x16_neon_func| PROC + push {r4-r6, lr} + + adr r12, bilinear_taps_coeff + ldr r4, [sp, #16] ;load *dst_ptr from stack + ldr r5, [sp, #20] ;load dst_pixels_per_line from stack + ldr r6, [sp, #24] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! + vld1.u8 {d11, d12, d13}, [r0], r1 + + bne vp8e_filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! 
;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + sub sp, sp, #256 + mov r3, sp + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +vp8e_filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5}, [r3]! + vst1.u8 {d6, d7}, [r3]! + vmov q11, q15 + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_sp16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + sub sp, sp, #528 ;reserve space on stack for temporary storage + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r3]! ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r3]! + vst1.u8 {d18, d19}, [r3]! + vst1.u8 {d20, d21}, [r3]! 
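+;each pass through this loop horizontally filters four rows (17 source
+;pixels in, 16 filtered pixels out per row); four passes fill the whole
+;16x16 block in the stack buffer addressed by r3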
+ + bne vp8e_filt_blk2d_fpo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + sub sp, sp, #528 ;reserve space on stack for temporary storage + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + mov r3, sp + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +vp8e_filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r3]! + vmov q11, q15 + vst1.u8 {d6, d7}, [r3]! + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_spo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r12, #8 + +sub_pixel_variance16x16_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q2}, [r4], r5 + vld1.8 {q1}, [r3]! + vld1.8 {q3}, [r4], r5 + + vsubl.u8 q11, d0, d4 ;diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne sub_pixel_variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r6] ;store sse + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 + + add sp, sp, #528 + vmov.32 r0, d0[0] ;return + + pop {r4-r6,pc} + + ENDP + + END diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm new file mode 100644 index 0000000000000000000000000000000000000000..155be4fc54b41c0d603e85f98e5334fcf2a33909 --- /dev/null +++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -0,0 +1,572 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_variance_halfpixvar16x16_h_neon| + EXPORT |vp8_variance_halfpixvar16x16_v_neon| + EXPORT |vp8_variance_halfpixvar16x16_hv_neon| + EXPORT |vp8_sub_pixel_variance16x16s_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;================================================ +;unsigned int vp8_variance_halfpixvar16x16_h_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_variance_halfpixvar16x16_h_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +;First Pass: output_height lines x output_width columns (16x16) +vp8_filt_fpo16x16s_4_0_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.8 {q11}, [r2], r3 + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.8 {q12}, [r2], r3 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.8 {q13}, [r2], r3 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vext.8 q3, q2, q3, #1 + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vld1.8 {q14}, [r2], r3 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + + vsubl.u8 q4, d0, d22 ;diff + vsubl.u8 q5, d1, d23 + vsubl.u8 q6, d2, d24 + vsubl.u8 q7, d3, d25 + vsubl.u8 q0, d4, d26 + vsubl.u8 q1, d5, d27 + vsubl.u8 q2, d6, d28 + vsubl.u8 q3, d7, d29 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + subs r12, r12, #1 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_fpo16x16s_4_0_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_variance_halfpixvar16x16_v_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_variance_halfpixvar16x16_v_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + + vld1.u8 {q0}, [r0], r1 ;load src data + ldr lr, [sp, #4] ;load *sse from stack + + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +vp8_filt_spo16x16s_0_4_loop_neon + vld1.u8 {q2}, [r0], r1 + vld1.8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vld1.8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vld1.8 {q5}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + + vrhadd.u8 q0, q0, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q4 + vrhadd.u8 q4, q4, q6 + vrhadd.u8 q6, q6, q15 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + 
vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + + vmov q0, q15 + + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_spo16x16s_0_4_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_variance_halfpixvar16x16_hv_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_variance_halfpixvar16x16_hv_neon| PROC + push {lr} + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q13, #0 ;q8 - sum + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + + vmov.i8 q14, #0 ;q9, q10 - sse + vmov.i8 q15, #0 + + mov r12, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8_filt16x16s_4_4_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vld1.8 {q5}, [r2], r3 + vrhadd.u8 q0, q0, q1 + vld1.8 {q6}, [r2], r3 + vrhadd.u8 q1, q1, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q3 + vld1.8 {q8}, [r2], r3 + vrhadd.u8 q3, q3, q4 + + vsubl.u8 q9, d0, d10 ;diff + vsubl.u8 q10, d1, d11 + vsubl.u8 q11, d2, d12 + vsubl.u8 q12, d3, d13 + + vsubl.u8 q0, d4, d14 ;diff + vsubl.u8 q1, d5, d15 + vsubl.u8 q5, d6, d16 + vsubl.u8 q6, d7, d17 + + vpadal.s16 q13, q9 ;sum + vmlal.s16 q14, d18, d18 ;sse + vmlal.s16 q15, d19, d19 + + vpadal.s16 q13, q10 ;sum + vmlal.s16 q14, d20, d20 ;sse + vmlal.s16 q15, d21, d21 + + vpadal.s16 q13, q11 ;sum + vmlal.s16 q14, d22, d22 ;sse + vmlal.s16 q15, d23, d23 + + vpadal.s16 q13, q12 ;sum + vmlal.s16 q14, d24, d24 ;sse + vmlal.s16 q15, d25, d25 + + subs r12, r12, #1 + + vpadal.s16 q13, q0 ;sum + vmlal.s16 q14, d0, d0 ;sse + vmlal.s16 q15, d1, d1 + + vpadal.s16 q13, q1 ;sum + vmlal.s16 q14, d2, d2 ;sse + vmlal.s16 q15, d3, d3 + + vpadal.s16 q13, q5 ;sum + vmlal.s16 q14, d10, d10 ;sse + vmlal.s16 q15, d11, d11 + + vmov q0, q4 + + vpadal.s16 q13, q6 ;sum + vmlal.s16 q14, d12, d12 ;sse + vmlal.s16 q15, d13, d13 + + bne vp8_filt16x16s_4_4_loop_neon + + vadd.u32 q15, q14, q15 ;accumulate sse + vpaddl.s32 q0, q13 ;accumulate 
sum + + vpaddl.u32 q1, q15 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;============================== +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pixels_per_line, +; stack unsigned int *sse +;note: in vp8_find_best_half_pixel_step()(called when 8dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mode_info_context->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} + +extern void vp8_build_intra_predictors_mby_s_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) +{ + unsigned char *y_buffer = x->dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mode_info_context->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} + +#endif diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c new file mode 100644 index 0000000000000000000000000000000000000000..891d767f06ab79fa915a02612d1904ba68a8f9ec --- /dev/null +++ b/vp8/common/arm/variance_arm.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_config.h" +#include "vpx_rtcd.h" +#include "vp8/common/variance.h" +#include "vp8/common/filter.h" + +#if HAVE_MEDIA +#include "vp8/common/arm/bilinearfilter_arm.h" + +unsigned int vp8_sub_pixel_variance8x8_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short first_pass[10*8]; + unsigned char second_pass[8*8]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 9, 8, HFilter); + vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 8, 8, 8, VFilter); + + return vp8_variance8x8_armv6(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); +} + +unsigned int vp8_sub_pixel_variance16x16_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short first_pass[36*16]; + unsigned char second_pass[20*16]; + const short *HFilter, *VFilter; + unsigned int var; + + if (xoffset == 4 && yoffset == 0) + { + var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else if (xoffset == 0 && yoffset == 4) + { + var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else if (xoffset == 4 && yoffset == 4) + { + var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else + { + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 17, 16, HFilter); + vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 16, 16, 16, VFilter); + + var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); + } + return var; +} + +#endif /* HAVE_MEDIA */ + + +#if HAVE_NEON + +extern unsigned int vp8_sub_pixel_variance16x16_neon_func +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); + +unsigned int vp8_sub_pixel_variance16x16_neon +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + if (xoffset == 4 && yoffset == 0) + return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 0 && yoffset == 4) + return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 4 && yoffset == 4) + return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else + return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); +} + +#endif diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c new file mode 100644 index 0000000000000000000000000000000000000000..ae22b5f6bd2dad1b92d9e764b24959373e640a21 --- /dev/null +++ b/vp8/common/asm_com_offsets.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2011 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/asm_offsets.h" +#include "vpx_scale/yv12config.h" +#include "vp8/common/blockd.h" + +#if CONFIG_POSTPROC +#include "postproc.h" +#endif /* CONFIG_POSTPROC */ + +BEGIN + +/* vpx_scale */ +DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); +DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); +DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); +DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); +DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); +DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); +DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); +DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); +DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); +DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); +DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS); + +#if CONFIG_POSTPROC +/* mfqe.c / filter_by_weight */ +DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION); +#endif /* CONFIG_POSTPROC */ + +END + +/* add asserts for any offset that is not supported by assembly code */ +/* add asserts for any size that is not supported by assembly code */ + +#if HAVE_MEDIA +/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */ +ct_assert(B_DC_PRED, B_DC_PRED == 0); +ct_assert(B_TM_PRED, B_TM_PRED == 1); +ct_assert(B_VE_PRED, B_VE_PRED == 2); +ct_assert(B_HE_PRED, B_HE_PRED == 3); +ct_assert(B_LD_PRED, B_LD_PRED == 4); +ct_assert(B_RD_PRED, B_RD_PRED == 5); +ct_assert(B_VR_PRED, B_VR_PRED == 6); +ct_assert(B_VL_PRED, B_VL_PRED == 7); +ct_assert(B_HD_PRED, B_HD_PRED == 8); +ct_assert(B_HU_PRED, B_HU_PRED == 9); +#endif + +#if HAVE_NEON +/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */ +ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32) +#endif + +#if HAVE_SSE2 +#if CONFIG_POSTPROC +/* vp8_filter_by_weight16x16 and 8x8 */ +ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4) +#endif /* CONFIG_POSTPROC */ +#endif /* HAVE_SSE2 */ diff --git a/vp8/common/blockd.c b/vp8/common/blockd.c new file mode 100644 index 0000000000000000000000000000000000000000..1fc3cd0ca7cb6de9f838a7ad7849b90c6fbb73f4 --- /dev/null +++ b/vp8/common/blockd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" + +const unsigned char vp8_block2left[25] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; +const unsigned char vp8_block2above[25] = +{ + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 +}; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h new file mode 100644 index 0000000000000000000000000000000000000000..f7ff57763526c104238fb97ee23aaca740d6e7b6 --- /dev/null +++ b/vp8/common/blockd.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_BLOCKD_H +#define __INC_BLOCKD_H + +void vpx_log(const char *format, ...); + +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "mv.h" +#include "treecoder.h" +#include "vpx_ports/mem.h" + +/*#define DCPRED 1*/ +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +typedef struct +{ + int r, c; +} POS; + +#define PLANE_TYPE_Y_NO_DC 0 +#define PLANE_TYPE_Y2 1 +#define PLANE_TYPE_UV 2 +#define PLANE_TYPE_Y_WITH_DC 3 + + +typedef char ENTROPY_CONTEXT; +typedef struct +{ + ENTROPY_CONTEXT y1[4]; + ENTROPY_CONTEXT u[2]; + ENTROPY_CONTEXT v[2]; + ENTROPY_CONTEXT y2; +} ENTROPY_CONTEXT_PLANES; + +extern const unsigned char vp8_block2left[25]; +extern const unsigned char vp8_block2above[25]; + +#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + Dest = (A)+(B); + + +typedef enum +{ + KEY_FRAME = 0, + INTER_FRAME = 1 +} FRAME_TYPE; + +typedef enum +{ + DC_PRED, /* average of above and left pixels */ + V_PRED, /* vertical prediction */ + H_PRED, /* horizontal prediction */ + TM_PRED, /* Truemotion prediction */ + B_PRED, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +/* Macroblock level features */ +typedef enum +{ + MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */ + MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */ + MB_LVL_MAX = 2 /* Number of MB level features supported */ + +} MB_LVL_FEATURES; + +/* Segment Feature Masks */ +#define SEGMENT_ALTQ 0x01 +#define SEGMENT_ALT_LF 0x02 + +#define VP8_YMODES (B_PRED + 1) +#define VP8_UV_MODES (TM_PRED + 1) + +#define VP8_MVREFS (1 + SPLITMV - NEARESTMV) + +typedef enum +{ + B_DC_PRED, /* average of above and left pixels */ + B_TM_PRED, + + B_VE_PRED, /* vertical prediction */ + B_HE_PRED, /* horizontal prediction */ + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ +#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. 
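+
+   As a minimal illustration (not actual code from this header): for a
+   keyframe the probability table for a 4x4 block's mode is selected
+   roughly as
+       p = vp8_kf_bmode_prob[above_mode][left_mode];
+   where a neighbour that lies outside the frame counts as B_DC_PRED.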
*/
+
+union b_mode_info
+{
+    B_PREDICTION_MODE as_mode;
+    int_mv mv;
+};
+
+typedef enum
+{
+    INTRA_FRAME = 0,
+    LAST_FRAME = 1,
+    GOLDEN_FRAME = 2,
+    ALTREF_FRAME = 3,
+    MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+    uint8_t mode, uv_mode;
+    uint8_t ref_frame;
+    uint8_t is_4x4;
+    int_mv mv;
+
+    uint8_t partitioning;
+    uint8_t mb_skip_coeff;    /* does this mb have coefficients at all? 1=no coefficients, 0=tokens need to be decoded */
+    uint8_t need_to_clamp_mvs;
+    uint8_t segment_id;       /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+    MB_MODE_INFO mbmi;
+    union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information that needs to be stored for the higher-resolution encoder */
+typedef struct
+{
+    MB_PREDICTION_MODE mode;
+    MV_REFERENCE_FRAME ref_frame;
+    int_mv mv;
+    int dissim;    /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information that needs to be stored for the
+ * higher-resolution encoder */
+typedef struct
+{
+    FRAME_TYPE frame_type;
+    int is_frame_dropped;
+    /* The frame number of each reference frame */
+    unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+    LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd
+{
+    short *qcoeff;
+    short *dqcoeff;
+    unsigned char *predictor;
+    short *dequant;
+
+    int offset;
+    char *eob;
+
+    union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd
+{
+    DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+    DECLARE_ALIGNED(16, short, qcoeff[400]);
+    DECLARE_ALIGNED(16, short, dqcoeff[400]);
+    DECLARE_ALIGNED(16, char, eobs[25]);
+
+    DECLARE_ALIGNED(16, short, dequant_y1[16]);
+    DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
+    DECLARE_ALIGNED(16, short, dequant_y2[16]);
+    DECLARE_ALIGNED(16, short, dequant_uv[16]);
+
+    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+    BLOCKD block[25];
+    int fullpixel_mask;
+
+    YV12_BUFFER_CONFIG pre;    /* Filtered copy of previous frame reconstruction */
+    YV12_BUFFER_CONFIG dst;
+
+    MODE_INFO *mode_info_context;
+    int mode_info_stride;
+
+    FRAME_TYPE frame_type;
+
+    int up_available;
+    int left_available;
+
+    unsigned char *recon_above[3];
+    unsigned char *recon_left[3];
+    int recon_left_stride[2];
+
+    /* Y,U,V,Y2 */
+    ENTROPY_CONTEXT_PLANES *above_context;
+    ENTROPY_CONTEXT_PLANES *left_context;
+
+    /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+    unsigned char segmentation_enabled;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+    unsigned char update_mb_segmentation_map;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+    unsigned char update_mb_segmentation_data;
+
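+    /* An illustrative note, not code from this header: with
+       SEGMENT_DELTADATA the decoder applies a feature value for segment s
+       as, e.g., q = base_qindex + segment_feature_data[MB_LVL_ALT_Q][s],
+       while with SEGMENT_ABSDATA the stored value is used directly. */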
+    /* 0 (SEGMENT_DELTADATA) the segment feature data are deltas from
+       frame-wide values, 1 (SEGMENT_ABSDATA) they are absolute values. */
+    unsigned char mb_segement_abs_delta;
+
+    /* Per-frame flags that define which MB-level features (such as quantizer
+       or loop filter level) are enabled and, when enabled, the probabilities
+       used to decode the per-MB flags in MB_MODE_INFO */
+    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability tree used to code segment number */
+
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
+
+    /* Mode-based loop filter adjustment */
+    unsigned char mode_ref_lf_delta_enabled;
+    unsigned char mode_ref_lf_delta_update;
+
+    /* Delta values have the range +/- MAX_LOOP_FILTER */
+    signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+    signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+    signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+    signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+    /* Distance of MB away from frame edges */
+    int mb_to_left_edge;
+    int mb_to_right_edge;
+    int mb_to_top_edge;
+    int mb_to_bottom_edge;
+
+    vp8_subpix_fn_t subpixel_predict;
+    vp8_subpix_fn_t subpixel_predict8x4;
+    vp8_subpix_fn_t subpixel_predict8x8;
+    vp8_subpix_fn_t subpixel_predict16x16;
+
+    void *current_bc;
+
+    int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer can be used for other
+     * purposes.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#endif /* __INC_BLOCKD_H */
diff --git a/vp8/common/coefupdateprobs.h b/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e194dc9a4dc829fa4564e7b9cda1c2c024df200
--- /dev/null
+++ b/vp8/common/coefupdateprobs.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */ + +const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 
255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; diff --git a/vp8/common/common.h b/vp8/common/common.h new file mode 100644 index 0000000000000000000000000000000000000000..2cc1c544cdfe681a6c42234b4d3bedda2a400e61 --- /dev/null +++ b/vp8/common/common.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+/* Only need this for fixed-size arrays, for structs just assign. */
+
+#define vp8_copy( Dest, Src) { \
+        assert( sizeof( Dest) == sizeof( Src)); \
+        vpx_memcpy( Dest, Src, sizeof( Src)); \
+    }
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+        assert( sizeof( *Dest) == sizeof( *Src)); \
+        vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+    }
+
+#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+
+#endif /* common_h */
diff --git a/vp8/common/context.c b/vp8/common/context.c
new file mode 100644
index 0000000000000000000000000000000000000000..99e95d30ffe4c8f6d5cef1fd01193465ac5a8057
--- /dev/null
+++ b/vp8/common/context.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+    {
+        // Block Type ( 0 )
+        {
+            // Coeff Band ( 0 )
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+            {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+            {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+        },
+        {
+            // Coeff Band ( 2 )
+            {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+            {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+            {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+        },
+        {
+            // Coeff Band ( 3 )
+            {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+            {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+            { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+        },
+        {
+            // Coeff Band ( 4 )
+            {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+            {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+            { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+        },
+        {
+            // Coeff Band ( 5 )
+            {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+            {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+            { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+        },
+        {
+            // Coeff Band ( 6 )
+            {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+            {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+            { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+        },
+        {
+            // Coeff Band ( 7 )
+            { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+            { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+        },
+    },
+    {
+        // Block Type ( 1 )
+        {
+            // Coeff Band ( 0 )
+            {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+            {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+            {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+            {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+        },
+        {
+            // Coeff Band ( 2 )
+            {15938, 14335, 1207, 184, 55, 13, 4, 1, 0,
0, 0, 0,}, + {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,}, + {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,}, + }, + { + // Coeff Band ( 3 ) + {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,}, + {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,}, + {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,}, + }, + { + // Coeff Band ( 4 ) + {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,}, + {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,}, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,}, + }, + { + // Coeff Band ( 5 ) + {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,}, + {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,}, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,}, + }, + { + // Coeff Band ( 6 ) + {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,}, + {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,}, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,}, + }, + { + // Coeff Band ( 7 ) + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,}, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + }, + { + // Block Type ( 2 ) + { + // Coeff Band ( 0 ) + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,}, + {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,}, + {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,}, + }, + { + // Coeff Band ( 1 ) + {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,}, + {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,}, + {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,}, + }, + { + // Coeff Band ( 2 ) + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,}, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,}, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,}, + }, + { + // Coeff Band ( 3 ) + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,}, + { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,}, + }, + { + // Coeff Band ( 4 ) + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,}, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,}, + }, + { + // Coeff Band ( 5 ) + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,}, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + { + // Coeff Band ( 6 ) + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,}, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,}, + }, + { + // Coeff Band ( 7 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + }, + { + // Block Type ( 3 ) + { + // Coeff Band ( 0 ) + {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,}, + {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,}, + {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,}, + }, + { + // Coeff Band ( 1 ) + {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,}, + {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,}, + {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,}, + }, + { + // Coeff Band ( 2 ) + {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,}, + {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,}, + {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,}, + }, + { + // Coeff Band ( 3 ) + {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,}, + {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,}, + {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,}, + }, + { + // Coeff Band ( 4 ) + {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,}, + {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,}, + {1157, 1175, 483, 
182, 88, 31, 8, 0, 0, 0, 0, 268,}, + }, + { + // Coeff Band ( 5 ) + {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,}, + {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,}, + {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,}, + }, + { + // Coeff Band ( 6 ) + {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,}, + {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,}, + {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,}, + }, + { + // Coeff Band ( 7 ) + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,}, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,}, + }, + }, +}; + +//Update probabilities for the nodes in the token entropy tree. +const vp8_prob tree_update_probs[vp8_coef_tree_dimen] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + 
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; +#endif diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c new file mode 100644 index 0000000000000000000000000000000000000000..46064e61d536d1e6212fcb40ebea90cab9350eee --- /dev/null +++ 
b/vp8/common/debugmodes.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+    int mb_row;
+    int mb_col;
+    int mb_index = 0;
+    FILE *mvs = fopen("mvs.stt", "a");
+
+    /* print out the macroblock Y modes */
+    mb_index = 0;
+    fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    mb_index = 0;
+    fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock UV modes */
+    mb_index = 0;
+    fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the block modes */
+    mb_index = 0;
+    fprintf(mvs, "Mbs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+
+                if (mi[mb_index].mbmi.mode == B_PRED)
+                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+                else
+                    fprintf(mvs, "xx ");
+
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock mvs */
+    mb_index = 0;
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+
+    /* print out the block motion vectors */
+    mb_index = 0;
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+                fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+
+    fclose(mvs);
+}
diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d195636bcb70b8b2f372ffa91cf5c35b7b9ecca
--- /dev/null
+++ b/vp8/common/default_coef_probs.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. +*/ + + +/*Generated file, included by entropy.c*/ + + +static const vp8_prob default_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = +{ + { /* Block Type ( 0 ) */ + { /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 1 ) */ + { /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 
231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 2 ) */ + { /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 3 ) */ + { /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } + 
}, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + } + } +}; diff --git a/vp8/common/dequantize.c b/vp8/common/dequantize.c new file mode 100644 index 0000000000000000000000000000000000000000..8eda48623a572371548cf23c74aa7afa31b742e6 --- /dev/null +++ b/vp8/common/dequantize.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vpx_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequantize_b_c(BLOCKD *d, short *DQC) +{ + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + + for (i = 0; i < 16; i++) + { + DQ[i] = Q[i] * DQC[i]; + } +} + +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride) +{ + int i; + + for (i = 0; i < 16; i++) + { + input[i] = dq[i] * input[i]; + } + + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); + + vpx_memset(input, 0, 32); + +} diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c new file mode 100644 index 0000000000000000000000000000000000000000..8c046a4f57cec94b4122d4e2b1c49a36ad689cd9 --- /dev/null +++ b/vp8/common/entropy.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "entropy.h" +#include "blockd.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +#include "coefupdateprobs.h" + +DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = +{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; + +DECLARE_ALIGNED(16, const unsigned char, + vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = +{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}; + +DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = +{ + 0, 1, 4, 8, + 5, 2, 3, 6, + 9, 12, 13, 10, + 7, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = +{ + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 +}; + +/* vp8_default_zig_zag_mask generated with: + + void vp8_init_scan_order_mask() + { + int i; + + for (i = 0; i < 16; i++) + { + vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i; + } + + } +*/ +DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) = +{ + 1, 2, 32, 64, + 4, 16, 128, 4096, + 8, 256, 2048, 8192, + 512, 1024, 16384, -32768 +}; + +const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6}; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ + +const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ +{ + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; + +/* vp8_coef_encodings generated with: + vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree); +*/ +vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] = +{ + {2, 2}, + {6, 3}, + {28, 5}, + {58, 6}, + {59, 6}, + {60, 6}, + {61, 6}, + {124, 7}, + {125, 7}, + {126, 7}, + {127, 7}, + {0, 1} +}; + +/* Trees for extra bits. 
Probabilities are constant and + do not depend on previously encoded bits */ + +static const vp8_prob Pcat1[] = { 159}; +static const vp8_prob Pcat2[] = { 165, 145}; +static const vp8_prob Pcat3[] = { 173, 148, 140}; +static const vp8_prob Pcat4[] = { 176, 155, 140, 135}; +static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const vp8_prob Pcat6[] = +{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129}; + + +/* tree index tables generated with: + + void init_bit_tree(vp8_tree_index *p, int n) + { + int i = 0; + + while (++i < n) + { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; + } + + void init_bit_trees() + { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 11); + } +*/ + +static const vp8_tree_index cat1[2] = { 0, 0 }; +static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; +static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; +static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; +static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, + 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 }; + +const vp8_extra_bit_struct vp8_extra_bits[12] = +{ + { 0, 0, 0, 0}, + { 0, 0, 0, 1}, + { 0, 0, 0, 2}, + { 0, 0, 0, 3}, + { 0, 0, 0, 4}, + { cat1, Pcat1, 1, 5}, + { cat2, Pcat2, 2, 7}, + { cat3, Pcat3, 3, 11}, + { cat4, Pcat4, 4, 19}, + { cat5, Pcat5, 5, 35}, + { cat6, Pcat6, 11, 67}, + { 0, 0, 0, 0} +}; + +#include "default_coef_probs.h" + +void vp8_default_coef_probs(VP8_COMMON *pc) +{ + vpx_memcpy(pc->fc.coef_probs, default_coef_probs, + sizeof(default_coef_probs)); +} + diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h new file mode 100644 index 0000000000000000000000000000000000000000..5389bc1de4e58f884289d690f36e5028a6ac2ffb --- /dev/null +++ b/vp8/common/entropy.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef __INC_ENTROPY_H +#define __INC_ENTROPY_H + +#include "treecoder.h" +#include "blockd.h" + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ + +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 + +extern const vp8_tree_index vp8_coef_tree[]; + +extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct +{ + vp8_tree_p tree; + const vp8_prob *prob; + int Len; + int base_val; +} vp8_extra_bit_struct; + +extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 2048 + + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ + +#define BLOCK_TYPES 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). 
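+
+   As a worked illustration (editor's note, derived from the description
+   above): when coding the first coefficient of a block whose left
+   neighbor block has nonzero coefficients but whose above neighbor does
+   not, the context index is 1; if that coefficient then decodes to, say,
+   5, the context for the next position becomes 2, since
+   vp8_prev_token_class maps every magnitude greater than one to class 2.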
*/ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +# define PREV_COEF_CONTEXTS 3 + +extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]); + +extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + +struct VP8Common; +void vp8_default_coef_probs(struct VP8Common *); + +extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); +extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; + +void vp8_coef_tree_initialize(void); +#endif diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c new file mode 100644 index 0000000000000000000000000000000000000000..091e4c732b0a90c88224e4f9cacd7ff94f738f38 --- /dev/null +++ b/vp8/common/entropymode.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#define USE_PREBUILT_TABLES + +#include "entropymode.h" +#include "entropy.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp8_entropymodedata.h" + +int vp8_mv_cont(const int_mv *l, const int_mv *a) +{ + int lez = (l->as_int == 0); + int aez = (a->as_int == 0); + int lea = (l->as_int == a->as_int); + + if (lea && lez) + return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) + return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) + return SUBMVREF_ABOVE_ZED; + + if (lez) + return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25}; + +const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] = +{ + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1 , 34 }, + { 208, 1 , 1 } +}; + + + +const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] = +{ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 1, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + }, + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + } +}; + +const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16}; + +const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150}; + + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ + +const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */ +{ + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +}; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. 
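+
+   An added note on reading these tables (an illustrative sketch, not part
+   of the original comment): entries come in pairs holding the child for
+   bit 0 and bit 1; a non-positive entry is a leaf storing the negated
+   token, while a positive entry is the index of the next pair.  A decode
+   walk is then simply (read_bool standing in for the boolean arithmetic
+   decoder):
+
+       i = 0;
+       while ((i = tree[i + read_bool(prob[i >> 1])]) > 0)
+           continue;
+       token = -i;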
*/ + +const vp8_tree_index vp8_ymode_tree[8] = +{ + -DC_PRED, 2, + 4, 6, + -V_PRED, -H_PRED, + -TM_PRED, -B_PRED +}; + +const vp8_tree_index vp8_kf_ymode_tree[8] = +{ + -B_PRED, 2, + 4, 6, + -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_uv_mode_tree[6] = +{ + -DC_PRED, 2, + -V_PRED, 4, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_mbsplit_tree[6] = +{ + -3, 2, + -2, 4, + -0, -1 +}; + +const vp8_tree_index vp8_mv_ref_tree[8] = +{ + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, 6, + -NEWMV, -SPLITMV +}; + +const vp8_tree_index vp8_sub_mv_ref_tree[6] = +{ + -LEFT4X4, 2, + -ABOVE4X4, 4, + -ZERO4X4, -NEW4X4 +}; + +const vp8_tree_index vp8_small_mvtree [14] = +{ + 2, 8, + 4, 6, + -0, -1, + -2, -3, + 10, 12, + -4, -5, + -6, -7 +}; + +void vp8_init_mbmode_probs(VP8_COMMON *x) +{ + vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob)); + vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob)); + vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); +} + +void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1]) +{ + vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +} + diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h new file mode 100644 index 0000000000000000000000000000000000000000..1df0f641e496987bf4ccfa34178a66c4881af187 --- /dev/null +++ b/vp8/common/entropymode.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef __INC_ENTROPYMODE_H +#define __INC_ENTROPYMODE_H + +#include "onyxc_int.h" +#include "treecoder.h" + +typedef enum +{ + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +typedef int vp8_mbsplit[16]; + +#define VP8_NUMMBSPLITS 4 + +extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS]; + +extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */ + +extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1]; + +extern int vp8_mv_cont(const int_mv *l, const int_mv *a); +#define SUBMVREF_COUNT 5 +extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1]; + + +extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES]; + + +extern const vp8_tree_index vp8_bmode_tree[]; + +extern const vp8_tree_index vp8_ymode_tree[]; +extern const vp8_tree_index vp8_kf_ymode_tree[]; +extern const vp8_tree_index vp8_uv_mode_tree[]; + +extern const vp8_tree_index vp8_mbsplit_tree[]; +extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sub_mv_ref_tree[]; + +extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES]; +extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES]; +extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS]; +extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS]; + +extern const vp8_tree_index vp8_small_mvtree[]; + +extern const struct vp8_token_struct vp8_small_mvencodings[8]; + +/* Key frame default mode probs */ +extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES] +[VP8_BINTRAMODES-1]; +extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1]; +extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1]; + +void vp8_init_mbmode_probs(VP8_COMMON *x); +void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); +void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); + +#endif diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c new file mode 100644 index 0000000000000000000000000000000000000000..e5df1f095555145f3d98e79e88b0d5939a55651a --- /dev/null +++ b/vp8/common/entropymv.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "entropymv.h" + +const MV_CONTEXT vp8_mv_update_probs[2] = +{ + {{ + 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 + }}, + {{ + 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 + }} +}; +const MV_CONTEXT vp8_default_mv_context[2] = +{ + {{ + /* row */ + 162, /* is short */ + 128, /* sign */ + 225, 146, 172, 147, 214, 39, 156, /* short tree */ + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */ + }}, + + + + {{ + /* same for column */ + 164, /* is short */ + 128, + 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */ + + }} +}; diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h new file mode 100644 index 0000000000000000000000000000000000000000..2db1e385baeaf64ed0a49ba7f7e505cd38b4589d --- /dev/null +++ b/vp8/common/entropymv.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPYMV_H +#define __INC_ENTROPYMV_H + +#include "treecoder.h" + +enum +{ + mv_max = 1023, /* max absolute value of a MV component */ + MVvals = (2 * mv_max) + 1, /* # possible values "" */ + mvfp_max = 255, /* max absolute value of a full pixel MV component */ + MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */ + + mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ + mvnum_short = 8, /* magnitudes 0 through 7 */ + + /* probability offsets for coding each MV component */ + + mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ + MVPsign, /* sign for non-zero */ + MVPshort, /* 8 short values = 7-position tree */ + + MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ + MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ +}; + +typedef struct mv_context +{ + vp8_prob prob[MVPcount]; /* often come in row, col pairs */ +} MV_CONTEXT; + +extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; + +#endif diff --git a/vp8/common/extend.c b/vp8/common/extend.c new file mode 100644 index 0000000000000000000000000000000000000000..c9bdd21897df076ddf82e467070f37a9a880cd8e --- /dev/null +++ b/vp8/common/extend.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane
+(
+    unsigned char *s, /* source */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
+    int h,            /* height */
+    int w,            /* width */
+    int et,           /* extend top border */
+    int el,           /* extend left border */
+    int eb,           /* extend bottom border */
+    int er            /* extend right border */
+)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+    int linesize;
+
+    /* copy the left and right most columns out */
+    src_ptr1 = s;
+    src_ptr2 = s + w - 1;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;
+
+    for (i = 0; i < h; i++)
+    {
+        vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+        vpx_memset(dest_ptr2, src_ptr2[0], er);
+        src_ptr1 += sp;
+        src_ptr2 += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
+    }
+
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;
+
+    for (i = 0; i < et; i++)
+    {
+        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+        dest_ptr1 += dp;
+    }
+
+    for (i = 0; i < eb; i++)
+    {
+        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+        dest_ptr2 += dp;
+    }
+}
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
+{
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);
+
+    et = dst->border >> 1;
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw)
+{
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+    int src_y_offset = srcy * src->y_stride + srcx;
+    int dst_y_offset = srcy * dst->y_stride + srcx;
+    int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+    int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+    /* If the side is not touching the border then don't extend.
*/ + if (srcy) + et = 0; + if (srcx) + el = 0; + if (srcy + srch != src->y_height) + eb = 0; + if (srcx + srcw != src->y_width) + er = 0; + + copy_and_extend_plane(src->y_buffer + src_y_offset, + src->y_stride, + dst->y_buffer + dst_y_offset, + dst->y_stride, + srch, srcw, + et, el, eb, er); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + srch = (srch + 1) >> 1; + srcw = (srcw + 1) >> 1; + + copy_and_extend_plane(src->u_buffer + src_uv_offset, + src->uv_stride, + dst->u_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, + src->uv_stride, + dst->v_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); +} + + +/* note the extension is only for the last row, for intra prediction purpose */ +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, + unsigned char *YPtr, + unsigned char *UPtr, + unsigned char *VPtr) +{ + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/vp8/common/extend.h b/vp8/common/extend.h new file mode 100644 index 0000000000000000000000000000000000000000..74a0b177d94d211d7889ce684720a282b5d4ec56 --- /dev/null +++ b/vp8/common/extend.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_EXTEND_H +#define __INC_EXTEND_H + +#include "vpx_scale/yv12config.h" + +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw); + +#endif diff --git a/vp8/common/filter.c b/vp8/common/filter.c new file mode 100644 index 0000000000000000000000000000000000000000..1901ea3b6664aaa453f568b9cad6d47cea6011cf --- /dev/null +++ b/vp8/common/filter.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+    { 128,   0 },
+    { 112,  16 },
+    {  96,  32 },
+    {  80,  48 },
+    {  64,  64 },
+    {  48,  80 },
+    {  32,  96 },
+    {  16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+    { 0,   0, 128,   0,   0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+    { 0,  -6, 123,  12,  -1, 0 },
+    { 2, -11, 108,  36,  -8, 1 }, /* New 1/4 pel 6 tap filter */
+    { 0,  -9,  93,  50,  -6, 0 },
+    { 3, -16,  77,  77, -16, 3 }, /* New 1/2 pel 6 tap filter */
+    { 0,  -6,  50,  93,  -9, 0 },
+    { 1,  -8,  36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
+    { 0,  -1,  12, 123,  -6, 0 },
+};
+
+static void filter_block2d_first_pass
+(
+    unsigned char *src_ptr,
+    int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                    * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]           * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]         * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]         * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = Temp;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr += src_pixels_per_line - output_width;
+        output_ptr += output_width;
+    }
+}
+
+static void filter_block2d_second_pass
+(
+    int *src_ptr,
+    unsigned char *output_ptr,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                    * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]           * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]         * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]         * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = (unsigned char)Temp;
+            src_ptr++;
+        }
+
+        /* Start next row */
+        src_ptr += src_pixels_per_line - output_width;
+        output_ptr += output_pitch;
+    }
+}
+
+
+static void filter_block2d
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    int output_pitch,
+    const short *HFilter,
+    const short *VFilter
+)
+{
+    int FData[9*4]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    int FData[13*16]; /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    int FData[13*16]; /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    int FData[21*24]; /* Temp data buffer used in filtering */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
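+ *                  Worked example (an illustrative note added here): with
+ *                  the 1/4-pel taps {96, 32} and adjacent source pixels
+ *                  100 and 104, the output is
+ *                  (100*96 + 104*32 + 64) >> 7 = 12992 >> 7 = 101.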
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+    unsigned char *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int src_stride,
+    unsigned int height,
+    unsigned int width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply bilinear filter */
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr += src_stride - width;
+        dst_ptr += width;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+    unsigned short *src_ptr,
+    unsigned char *dst_ptr,
+    int dst_pitch,
+    unsigned int height,
+    unsigned int width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int Temp;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
+                   (VP8_FILTER_WEIGHT / 2);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            src_ptr++;
+        }
+
+        /* Next row... */
+        dst_ptr += dst_pitch;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr   : Pointer to source block.
+ *                  UINT32  src_pitch : Stride of source block.
+ *                  UINT32  dst_pitch : Stride of destination block.
+ *                  INT16  *HFilter   : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter   : Array of 2 vertical filter taps.
+ *                  INT32   Width     : Block width
+ *                  INT32   Height    : Block height
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr   : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size that can be handled here is 16x16
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    unsigned int src_pitch,
+    unsigned int dst_pitch,
+    const short *HFilter,
+    const short *VFilter,
+    int Width,
+    int Height
+)
+{
+
+    unsigned short FData[17*16]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically...
*/ + filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; +#if 0 + { + int i; + unsigned char temp1[16]; + unsigned char temp2[16]; + + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + + for (i = 0; i < 16; i++) + { + if (temp1[i] != temp2[i]) + { + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + } + } + } +#endif + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); + +} + +void vp8_bilinear_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); + +} + +void vp8_bilinear_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); + +} + +void vp8_bilinear_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp8/common/filter.h b/vp8/common/filter.h new file mode 100644 index 0000000000000000000000000000000000000000..b7591f268be7e27492c89cf2d22746e9143849e9 --- /dev/null +++ b/vp8/common/filter.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef FILTER_H +#define FILTER_H + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +extern const short vp8_bilinear_filters[8][2]; +extern const short vp8_sub_pel_filters[8][6]; + +#endif diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c new file mode 100644 index 0000000000000000000000000000000000000000..e8ee40f56c6d1a5349d256c07c0f845c4c74c379 --- /dev/null +++ b/vp8/common/findnearmv.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "findnearmv.h" + +const unsigned char vp8_mbsplit_offset[4][16] = { + { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} +}; + +/* Predict motion vectors using those from already-decoded nearby blocks. + Note that we only consider one 4x4 subblock from each candidate 16x16 + macroblock. */ +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, + int_mv *nearby, + int_mv *best_mv, + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[4]; + int_mv *mv = near_mvs; + int *cntx = cnt; + enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV}; + + /* Zero accumulators */ + mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) + { + if (above->mbmi.mv.as_int) + { + (++mv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) + { + if (left->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } + else + cnt[CNT_INTRA] += 2; + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) + { + if (aboveleft->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } + else + cnt[CNT_INTRA] += 1; + } + + /* If we have three distinct MV's ... 
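+       (added clarification: the above-left vector was only compared
+       against the immediately preceding candidate, so it may still equal
+       NEAREST; when it does, its weight is merged into NEAREST, and
+       cnt[CNT_SPLITMV] is then re-computed below as the SPLITMV weight)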
*/ + if (cnt[CNT_SPLITMV]) + { + /* See if above-left MV can be merged with NEAREST */ + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + cnt[CNT_NEAREST] += 1; + } + + cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) + { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = tmp; + } + + /* Use near_mvs[0] to store the "best" MV */ + if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) + near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; + + /* Set up return values */ + best_mv->as_int = near_mvs[0].as_int; + nearest->as_int = near_mvs[CNT_NEAREST].as_int; + nearby->as_int = near_mvs[CNT_NEAR].as_int; +} + + +static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd) +{ + inv->as_mv.row = src->as_mv.row * -1; + inv->as_mv.col = src->as_mv.col * -1; + vp8_clamp_mv2(inv, xd); + vp8_clamp_mv2(src, xd); +} + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + int sign_bias = ref_frame_sign_bias[refframe]; + + vp8_find_near_mvs(xd, + here, + &mode_mv_sb[sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARMV], + &best_mv_sb[sign_bias], + cnt, + refframe, + ref_frame_sign_bias); + + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARESTMV], xd); + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV], + &mode_mv_sb[sign_bias][NEARMV], xd); + invert_and_clamp_mvs(&best_mv_sb[!sign_bias], + &best_mv_sb[sign_bias], xd); + + return sign_bias; +} + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +) +{ + p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0]; + p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1]; + p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2]; + p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3]; + /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/ + return p; +} + diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h new file mode 100644 index 0000000000000000000000000000000000000000..06ef060c2d6d17c2474245d6ff7c871ef0d87b14 --- /dev/null +++ b/vp8/common/findnearmv.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef __INC_FINDNEARMV_H +#define __INC_FINDNEARMV_H + +#include "mv.h" +#include "blockd.h" +#include "modecont.h" +#include "treecoder.h" + + +static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, + const int *ref_frame_sign_bias) +{ + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) + { + mvp->as_mv.row *= -1; + mvp->as_mv.col *= -1; + } +} + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) +static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) +{ + if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) + mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + + if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) + mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; +} + +static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge, + int mb_to_top_edge, int mb_to_bottom_edge) +{ + mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ? + mb_to_left_edge : mv->as_mv.col; + mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ? + mb_to_right_edge : mv->as_mv.col; + mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ? + mb_to_top_edge : mv->as_mv.row; + mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ? + mb_to_bottom_edge : mv->as_mv.row; +} +static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) +{ + unsigned int need_to_clamp; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); + return need_to_clamp; +} + +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, int_mv *nearby, int_mv *best, + int near_mv_ref_cts[4], + int refframe, + int *ref_frame_sign_bias +); + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +); + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +); + +extern const unsigned char vp8_mbsplit_offset[4][16]; + + +static int left_block_mv(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 4; + } + + return (cur_mb->bmi + b - 1)->mv.as_int; +} + +static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 16; + } + + return (cur_mb->bmi + b - 4)->mv.as_int; +} +static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + switch (cur_mb->mbmi.mode) + { + case B_PRED: + return (cur_mb->bmi + b + 3)->as_mode; + case DC_PRED: + return B_DC_PRED; + case V_PRED: + return B_VE_PRED; + case H_PRED: + return B_HE_PRED; + case TM_PRED: + return B_TM_PRED; + default: + return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 1)->as_mode; +} + 
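+/* Illustrative note (editor's sketch, derived from the lookups above): the
+ * subblock index b is in raster order within the macroblock,
+ *
+ *      0  1  2  3
+ *      4  5  6  7
+ *      8  9 10 11
+ *     12 13 14 15
+ *
+ * so (b & 3) is the subblock column and (b >> 2) its row.  For column 0 the
+ * left neighbor lives in the previous MODE_INFO at column 3 (hence b += 4
+ * before the b - 1 lookup), and for row 0 the above neighbor lives one
+ * mode_info_stride up at row 3 (hence b += 16 before b - 4).
+ */
+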
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        switch (cur_mb->mbmi.mode)
+        {
+        case B_PRED:
+            return (cur_mb->bmi + b + 12)->as_mode;
+        case DC_PRED:
+            return B_DC_PRED;
+        case V_PRED:
+            return B_VE_PRED;
+        case H_PRED:
+            return B_HE_PRED;
+        case TM_PRED:
+            return B_TM_PRED;
+        default:
+            return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 4)->as_mode;
+}
+
+#endif
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a6ac7b0e3a5d12729bc8341c735a0993741c30f
--- /dev/null
+++ b/vp8/common/generic/systemdependent.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#elif defined(__OS2__)
+    {
+        ULONG proc_id;
+        ULONG status;
+
+        core_count = 0;
+        for (proc_id = 1; ; proc_id++)
+        {
+            if (DosGetProcessorStatus(proc_id, &status))
+                break;
+
+            if (status == PROC_ONLINE)
+                core_count++;
+        }
+    }
+#else
+    /* other platforms */
+#endif
+
+    return core_count > 0 ? core_count : 1;
+}
+#endif
+
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+    ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+    ctx->cpu_caps = x86_simd_caps();
+#endif
+}
diff --git a/vp8/common/header.h b/vp8/common/header.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e98eeb3c344e78f46f9f31ceda6781232170dd2
--- /dev/null
+++ b/vp8/common/header.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
diff --git a/vp8/common/idct_blk.c b/vp8/common/idct_blk.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b058c76af936a5b18c95d7f88984eb4b43efc14
--- /dev/null
+++ b/vp8/common/idct_blk.c
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride);
+
+void vp8_dequant_idct_add_y_block_c
+    (short *q, short *dq,
+     unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dst, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q   += 16;
+            dst += 4;
+        }
+
+        dst += 4*stride - 16;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+    (short *q, short *dq,
+     unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstu, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            dstu += 4;
+        }
+
+        dstu += 4*stride - 8;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstv, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            dstv += 4;
+        }
+
+        dstv += 4*stride - 8;
+    }
+}
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
new file mode 100644
index 0000000000000000000000000000000000000000..47af52f04e7503842444e288150abcafc1297600
--- /dev/null
+++ b/vp8/common/idctllm.c
@@ -0,0 +1,204 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point versions of two
+ * multiply constants:
+ *    1. sqrt(2) * cos (pi/8)
+ *    2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ *    x * a = x + x*(a-1)
+ * so
+ *    x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
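A quick way to convince yourself of the trick in the note above: for a constant a > 1 only a-1 has to fit in 16 fractional bits, and 20091 and 35468 are exactly round((sqrt(2)*cos(pi/8) - 1) * 65536) and round(sqrt(2)*sin(pi/8) * 65536). A standalone check (compile with -lm; like the codec itself, it assumes arithmetic right shift of negative values):

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    int main(void)
    {
        const int cospi8sqrt2minus1 = 20091;
        const int sinpi8sqrt2 = 35468;
        int x;

        for (x = -2048; x <= 2048; x += 512)
        {
            /* x * sqrt(2)*cos(pi/8) via x + x*(a-1), and x * sqrt(2)*sin(pi/8) */
            int    fix_c = x + ((x * cospi8sqrt2minus1) >> 16);
            int    fix_s = (x * sinpi8sqrt2) >> 16;
            double ref_c = x * sqrt(2.0) * cos(M_PI / 8);
            double ref_s = x * sqrt(2.0) * sin(M_PI / 8);

            printf("x=%6d  cos-term %6d (ref %9.2f)  sin-term %6d (ref %9.2f)\n",
                   x, fix_c, ref_c, fix_s, ref_s);
        }
        return 0;
    }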
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    int i;
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[2];
+        b1 = ip[0] - ip[2];
+
+        temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+        temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        ip += shortpitch;
+        op += shortpitch;
+    }
+
+    ip = output;
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = ip[c] + pred_ptr[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a;
+        }
+        ip += 4;
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    int a1 = ((input_dc + 4) >> 3);
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = a1 + pred_ptr[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a;
+        }
+
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+    short output[16];
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0] = a1 + b1;
+        op[4] = c1 + d1;
+        op[8] = a1 - b1;
+        op[12] = d1 - c1;
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[3];
+        b1 = ip[1] + ip[2];
+        c1 = ip[1] - ip[2];
+        d1 = ip[0] - ip[3];
+
+        a2 = a1 + b1;
+        b2 = c1 + d1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        op[0] = (a2 + 3) >> 3;
+        op[1] = (b2 + 3) >> 3;
+        op[2] = (c2 + 3) >> 3;
+        op[3] = (d2 + 3) >> 3;
+
+        ip += 4;
+        op += 4;
+    }
+
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = output[i];
+    }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+    int i;
+    int a1;
+
+    a1 = ((input[0] + 3) >> 3);
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = a1;
+    }
+}
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h new file mode 100644 index
0000000000000000000000000000000000000000..d048665b9a4ebec44088330cbb01ffc8347ad369 --- /dev/null +++ b/vp8/common/invtrans.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_INVTRANS_H +#define __INC_INVTRANS_H + +#include "vpx_config.h" +#include "vpx_rtcd.h" +#include "blockd.h" +#include "onyxc_int.h" + +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + +static void eob_adjust(char *eobs, short *diff) +{ + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for(js = 0; js < 16; js++) + { + if((eobs[js] == 0) && (diff[0] != 0)) + eobs[js]++; + diff+=16; + } +} + +static void vp8_inverse_transform_mby(MACROBLOCKD *xd) +{ + short *DQC = xd->dequant_y1; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + vp8_short_inv_walsh4x4 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + else + { + vp8_short_inv_walsh4x4_1 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + + DQC = xd->dequant_y1_dc; + } + vp8_dequant_idct_add_y_block + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); +} +#endif diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c new file mode 100644 index 0000000000000000000000000000000000000000..41b4f1214d5da8e52853beb5bd068e6d84c29354 --- /dev/null +++ b/vp8/common/loopfilter.c @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+typedef unsigned char uc;
+
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+    int filt_lvl;
+
+    for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+    {
+        if (filt_lvl >= 40)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+        }
+        else if (filt_lvl >= 20)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+        }
+        else if (filt_lvl >= 15)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+        }
+        else
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+        }
+    }
+
+    lfi->mode_lf_lut[DC_PRED] = 1;
+    lfi->mode_lf_lut[V_PRED] = 1;
+    lfi->mode_lf_lut[H_PRED] = 1;
+    lfi->mode_lf_lut[TM_PRED] = 1;
+    lfi->mode_lf_lut[B_PRED]  = 0;
+
+    lfi->mode_lf_lut[ZEROMV]    = 1;
+    lfi->mode_lf_lut[NEARESTMV] = 2;
+    lfi->mode_lf_lut[NEARMV]    = 2;
+    lfi->mode_lf_lut[NEWMV]     = 2;
+    lfi->mode_lf_lut[SPLITMV]   = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl)
+{
+    int i;
+
+    /* For each possible value for the loop filter fill out limits */
+    for (i = 0; i <= MAX_LOOP_FILTER; i++)
+    {
+        int filt_lvl = i;
+        int block_inside_limit = 0;
+
+        /* Set loop filter parameters that control sharpness. */
+        block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+        block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+        if (sharpness_lvl > 0)
+        {
+            if (block_inside_limit > (9 - sharpness_lvl))
+                block_inside_limit = (9 - sharpness_lvl);
+        }
+
+        if (block_inside_limit < 1)
+            block_inside_limit = 1;
+
+        vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+                   SIMD_WIDTH);
+        vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+                   SIMD_WIDTH);
+    }
+}
+
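Tracing vp8_loop_filter_update_sharpness for one filter level makes the effect of the sharpness control concrete; this sketch simply re-runs the arithmetic above outside the library:

    #include <stdio.h>

    /* Reproduce the limit derivation for a single (level, sharpness) pair. */
    static void show_limits(int filt_lvl, int sharpness_lvl)
    {
        int block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
        block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);

        if (sharpness_lvl > 0 && block_inside_limit > 9 - sharpness_lvl)
            block_inside_limit = 9 - sharpness_lvl;
        if (block_inside_limit < 1)
            block_inside_limit = 1;

        printf("lvl=%2d sharp=%d -> lim=%2d blim=%3d mblim=%3d\n",
               filt_lvl, sharpness_lvl, block_inside_limit,
               2 * filt_lvl + block_inside_limit,
               2 * (filt_lvl + 2) + block_inside_limit);
    }

    int main(void)
    {
        show_limits(40, 0);   /* lim=40 blim=120 mblim=124 */
        show_limits(40, 5);   /* lim= 4 blim= 84 mblim= 88 */
        return 0;
    }

Raising the sharpness level only tightens the inner limit lim; the blim/mblim edge thresholds keep their dependence on the filter level itself.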
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+    loop_filter_info_n *lfi = &cm->lf_info;
+    int i;
+
+    /* init limits for given sharpness */
+    vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    /* init LUT for lvl and hev thr picking */
+    lf_init_lut(lfi);
+
+    /* init hev threshold const vectors */
+    for(i = 0; i < 4 ; i++)
+    {
+        vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+    }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+                                MACROBLOCKD *mbd,
+                                int default_filt_lvl)
+{
+    int seg,  /* segment number */
+        ref,  /* index in ref_lf_deltas */
+        mode; /* index in mode_lf_deltas */
+
+    loop_filter_info_n *lfi = &cm->lf_info;
+
+    /* update limits if sharpness has changed */
+    if(cm->last_sharpness_level != cm->sharpness_level)
+    {
+        vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        int lvl_seg = default_filt_lvl;
+        int lvl_ref, lvl_mode;
+
+        /* Note the baseline filter values for each segment */
+        if (mbd->segmentation_enabled)
+        {
+            /* Abs value */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            {
+                lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            }
+            else  /* Delta Value */
+            {
+                lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+                lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+            }
+        }
+
+        if (!mbd->mode_ref_lf_delta_enabled)
+        {
+            /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; encoder always uses deltas
+             */
+            vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            continue;
+        }
+
+        lvl_ref = lvl_seg;
+
+        /* INTRA_FRAME */
+        ref = INTRA_FRAME;
+
+        /* Apply delta for reference frame */
+        lvl_ref += mbd->ref_lf_deltas[ref];
+
+        /* Apply delta for Intra modes */
+        mode = 0; /* B_PRED */
+        /* Only the split mode BPRED has a further special case */
+        lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        mode = 1; /* all the rest of Intra modes */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        /* LAST, GOLDEN, ALT */
+        for(ref = 1; ref < MAX_REF_FRAMES; ref++)
+        {
+            int lvl_ref = lvl_seg;
+
+            /* Apply delta for reference frame */
+            lvl_ref += mbd->ref_lf_deltas[ref];
+
+            /* Apply delta for Inter modes */
+            for (mode = 1; mode < 4; mode++)
+            {
+                lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+                lfi->lvl[seg][ref][mode] = lvl_mode;
+            }
+        }
+    }
+}
+
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                       mode_info_context->mbmi.mode != SPLITMV &&
+                       mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+            lfi.mblim = lfi_n->mblim[filter_level];
+            lfi.blim = lfi_n->blim[filter_level];
+            lfi.lim = lfi_n->lim[filter_level];
+            lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+            if (mb_col > 0)
+                vp8_loop_filter_mbv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_mbh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                       mode_info_context->mbmi.mode != SPLITMV &&
+                       mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame =
mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + +} +void vp8_loop_filter_frame(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int frame_type) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int mb_row; + int mb_col; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + + int filter_level; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + int post_y_stride = post->y_stride; + int post_uv_stride = post->uv_stride; + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, cm->filter_level); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + /* vp8_filter each macro block */ + if (cm->filter_type == NORMAL_LOOPFILTER) + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } + else /* SIMPLE_LOOPFILTER */ + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if (filter_level) + { 
+ const unsigned char * mblim = lfi_n->mblim[filter_level]; + const unsigned char * blim = lfi_n->blim[filter_level]; + + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_y_stride, blim); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_y_stride, blim); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } +} + +void vp8_loop_filter_frame_yonly +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context ++; /* step to next MB */ + + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context ++; /* Skip border mb */ + } + +} + +void vp8_loop_filter_partial_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + int mb_cols = 
post->y_width >> 4; + int mb_rows = post->y_height >> 4; + + int linestocopy, i; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + int alt_flt_enabled = mbd->segmentation_enabled; + FRAME_TYPE frame_type = cm->frame_type; + + const MODE_INFO *mode_info_context; + + int lvl_seg[MAX_MB_SEGMENTS]; + + /* number of MB rows to use in partial filtering */ + linestocopy = mb_rows / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Note the baseline filter values for each segment */ + /* See vp8_loop_filter_frame_init. Rather than call that for each change + * to default_filt_lvl, copy the relevant calculation here. + */ + if (alt_flt_enabled) + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + { /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + } + /* Delta Value */ + else + { + lvl_seg[i] = default_filt_lvl + + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + lvl_seg[i] = (lvl_seg[i] > 0) ? + ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0; + } + } + } + + /* Set up the buffer pointers; partial image starts at ~middle of frame */ + y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride; + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + if (alt_flt_enabled) + filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; + else + filter_level = default_filt_lvl; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context += 1; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context += 1; /* Skip border mb */ + } +} diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h new file mode 100644 index 0000000000000000000000000000000000000000..b3af2d65007137885cdb9e04f9a477aebd38aefc --- /dev/null +++ b/vp8/common/loopfilter.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#define MAX_LOOP_FILTER 63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION 8
+
+typedef enum
+{
+    NORMAL_LOOPFILTER = 0,
+    SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct
+{
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+    unsigned char lvl[4][4][4];
+    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+    unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{
+    const unsigned char * mblim;
+    const unsigned char * blim;
+    const unsigned char * lim;
+    const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+    unsigned char *u,   /* source pointer */
+    int p,              /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+                           int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+#endif
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000000000000000000000000000000000000..8235f6e9f8cb5282397b55c484e9509c585636cb
--- /dev/null
+++ b/vp8/common/loopfilter_filters.c
@@ -0,0 +1,430 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
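The edge filters below build their decisions with a branch-free idiom: each comparison yields 0 or 1, the results are ORed together, and subtracting 1 turns "no limit violated" into an all-ones byte that can be ANDed straight into the filter taps. A simplified standalone illustration (the real vp8_filter_mask below also folds in a weighted p0/q0 test against blimit):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        unsigned char limit = 20, p1 = 100, p0 = 105, q0 = 112, q1 = 110;
        signed char mask = 0;

        /* each term is 1 if an edge difference violates the limit */
        mask |= (abs(p1 - p0) > limit);   /*  5 > 20 -> 0 */
        mask |= (abs(q1 - q0) > limit);   /*  2 > 20 -> 0 */
        mask |= (abs(p0 - q0) > limit);   /*  7 > 20 -> 0 */

        /* no violation: 0 - 1 = -1 (all ones) means "filter this edge";
         * any violation would give 1 - 1 = 0, leaving the pixels alone */
        mask = mask - 1;
        printf("mask = %d\n", mask);      /* prints -1 */
        return 0;
    }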
+static signed char vp8_signed_char_clamp(int t)
+{
+    t = (t < -128 ? -128 : t);
+    t = (t > 127 ? 127 : t);
+    return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+                                   uc p3, uc p2, uc p1, uc p0,
+                                   uc q0, uc q1, uc q2, uc q3)
+{
+    signed char mask = 0;
+    mask |= (abs(p3 - p2) > limit);
+    mask |= (abs(p2 - p1) > limit);
+    mask |= (abs(p1 - p0) > limit);
+    mask |= (abs(q1 - q0) > limit);
+    mask |= (abs(q2 - q1) > limit);
+    mask |= (abs(q3 - q2) > limit);
+    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
+    return mask - 1;
+}
+
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+    signed char hev = 0;
+    hev |= (abs(p1 - p0) > thresh) * -1;
+    hev |= (abs(q1 - q0) > thresh) * -1;
+    return hev;
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+                       uc *op0, uc *oq0, uc *oq1)
+
+{
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char vp8_filter, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+    vp8_filter &= hev;
+
+    /* inner taps */
+    vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+    vp8_filter &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3;
+     * if it equals 4 we'll set it to adjust by -1 to account for the
+     * fact that we'd round 3 the other way
+     */
+    Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+    Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    u = vp8_signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    vp8_filter = Filter1;
+
+    /* outer tap adjustments */
+    vp8_filter += 1;
+    vp8_filter >>= 1;
+    vp8_filter &= ~hev;
+
+    u = vp8_signed_char_clamp(qs1 - vp8_filter);
+    *oq1 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps1 + vp8_filter);
+    *op1 = u ^ 0x80;
+
+}
+void vp8_loop_filter_horizontal_edge_c
+(
+    unsigned char *s,
+    int p, /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    int hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+    do
+    {
+        mask = vp8_filter_mask(limit[0], blimit[0],
+                               s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+                               s[0*p], s[1*p], s[2*p], s[3*p]);
+
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+        vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+        ++s;
+    }
+    while (++i < count * 8);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    int hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+ */ + do + { + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } + while (++i < count * 8); +} + +static void vp8_mbfilter(signed char mask, uc hev, + uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) +{ + signed char s, u; + signed char vp8_filter, Filter1, Filter2; + signed char ps2 = (signed char) * op2 ^ 0x80; + signed char ps1 = (signed char) * op1 ^ 0x80; + signed char ps0 = (signed char) * op0 ^ 0x80; + signed char qs0 = (signed char) * oq0 ^ 0x80; + signed char qs1 = (signed char) * oq1 ^ 0x80; + signed char qs2 = (signed char) * oq2 ^ 0x80; + + /* add outer taps if we have high edge variance */ + vp8_filter = vp8_signed_char_clamp(ps1 - qs1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); + vp8_filter &= mask; + + Filter2 = vp8_filter; + Filter2 &= hev; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + + /* only apply wider filter if not high edge variance */ + vp8_filter &= ~hev; + Filter2 = vp8_filter; + + /* roughly 3/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + /* roughly 2/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + /* roughly 1/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + +void vp8_mbloop_filter_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); + + ++s; + } + while (++i < count * 8); + +} + + +void vp8_mbloop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + s += p; + } + while (++i < count * 8); + +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) +{ +/* Why does this cause problems for win32? 
+ * error C2143: syntax error : missing ';' before 'type' + * (void) limit; + */ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; + return mask; +} + +static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1) +{ + signed char vp8_filter, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + vp8_filter = vp8_signed_char_clamp(p1 - q1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0)); + vp8_filter &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp8_loop_filter_simple_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } + while (++i < 16); +} + +void vp8_loop_filter_simple_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); + vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); + s += p; + } + while (++i < 16); + +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + 
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); +} diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c new file mode 100644 index 0000000000000000000000000000000000000000..32e1b66409f685ef93e51a5f07eb58f0fbed82f8 --- /dev/null +++ b/vp8/common/mbpitch.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "blockd.h" + +void vp8_setup_block_dptrs(MACROBLOCKD *x) +{ + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 25; r++) + { + x->block[r].qcoeff = x->qcoeff + r * 16; + x->block[r].dqcoeff = x->dqcoeff + r * 16; + x->block[r].eob = x->eobs + r; + } +} + +void vp8_build_block_doffsets(MACROBLOCKD *x) +{ + int block; + + for (block = 0; block < 16; block++) /* y blocks */ + { + x->block[block].offset = + (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4; + } + + for (block = 16; block < 20; block++) /* U and V blocks */ + { + x->block[block+4].offset = + x->block[block].offset = + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4; + } +} diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c new file mode 100644 index 0000000000000000000000000000000000000000..3dff1505b04e05367f7a101d462534d4675bf877 --- /dev/null +++ b/vp8/common/mfqe.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate limited situations keyframes may cause significant visual artifacts
+ * commonly referred to as "popping." This file implements a postprocessing
+ * algorithm which blends data from the preceding frame when there is no
+ * motion and the q from the previous frame is lower, which indicates that it
+ * is higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+static void filter_by_weight(unsigned char *src, int src_stride,
+                             unsigned char *dst, int dst_stride,
+                             int block_size, int src_weight)
+{
+    int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int rounding_bit = 1 << (MFQE_PRECISION - 1);
+    int r, c;
+
+    for (r = 0; r < block_size; r++)
+    {
+        for (c = 0; c < block_size; c++)
+        {
+            dst[c] = (src[c] * src_weight +
+                      dst[c] * dst_weight +
+                      rounding_bit) >> MFQE_PRECISION;
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+                                 unsigned char *dst, int dst_stride,
+                                 int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static void apply_ifactor(unsigned char *y_src,
+                          int y_src_stride,
+                          unsigned char *y_dst,
+                          int y_dst_stride,
+                          unsigned char *u_src,
+                          unsigned char *v_src,
+                          int uv_src_stride,
+                          unsigned char *u_dst,
+                          unsigned char *v_dst,
+                          int uv_dst_stride,
+                          int block_size,
+                          int src_weight)
+{
+    if (block_size == 16)
+    {
+        vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+    else /* if (block_size == 8) */
+    {
+        vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+}
+
+static unsigned int int_sqrt(unsigned int x)
+{
+    unsigned int y = x;
+    unsigned int guess;
+    int p = 1;
+    while (y>>=1) p++;
+    p>>=1;
+
+    guess=0;
+    while (p>=0)
+    {
+        guess |= (1<<p);
+        if (x<guess*guess)
+            guess -= (1<<p);
+        p--;
+    }
+    /* choose between guess or guess+1 */
+    return guess+(guess*guess+guess+1<=x);
+}
+
+#define USE_SSD
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16, 8 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{
+    static const unsigned char VP8_ZEROS[16]=
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
+    int uvblksize = blksize >> 1;
+    int qdiff = qcurr - qprev;
+
+    int i;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
+
+    if (blksize == 16)
+    {
+        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+#ifdef USE_SSD
+        sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+        sad = (sse + 128)>>8;
+        usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+        usad = (sse + 32)>>6;
+        vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+        vsad = (sse + 32)>>6;
+#else
+        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
+        usad = (vp8_sad8x8(u, uv_stride, ud,
uvd_stride, UINT_MAX) + 32) >> 6; + vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6; +#endif + } + else /* if (blksize == 8) */ + { + actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; +#ifdef USE_SSD + sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse)); + sad = (sse + 32)>>6; + usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse)); + usad = (sse + 8)>>4; + vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse)); + vsad = (sse + 8)>>4; +#else + sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6; + usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4; + vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4; +#endif + } + + actrisk = (actd > act * 5); + + /* thr = qdiff/16 + log2(act) + log4(qprev) */ + thr = (qdiff >> 4); + while (actd >>= 1) thr++; + while (qprev >>= 2) thr++; + +#ifdef USE_SSD + thrsq = thr * thr; + if (sad < thrsq && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk) +#else + if (sad < thr && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 2 * usad < thr && 2 * vsad < thr && !actrisk) +#endif + { + int ifactor; +#ifdef USE_SSD + /* TODO: optimize this later to not need sqr root */ + sad = int_sqrt(sad); +#endif + ifactor = (sad << MFQE_PRECISION) / thr; + ifactor >>= (qdiff >> 5); + + if (ifactor) + { + apply_ifactor(y, y_stride, yd, yd_stride, + u, v, uv_stride, + ud, vd, uvd_stride, + blksize, ifactor); + } + } + else /* else implicitly copy from previous frame */ + { + if (blksize == 16) + { + vp8_copy_mem16x16(y, y_stride, yd, yd_stride); + vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); + vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); + } + else /* if (blksize == 8) */ + { + vp8_copy_mem8x8(y, y_stride, yd, yd_stride); + for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) + vpx_memcpy(udp, up, uvblksize); + for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride) + vpx_memcpy(vdp, vp, uvblksize); + } + } +} + +static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) +{ + if (mode_info_context->mbmi.mb_skip_coeff) + map[0] = map[1] = map[2] = map[3] = 1; + else if (mode_info_context->mbmi.mode==SPLITMV) + { + static int ndx[4][4] = + { + {0, 1, 4, 5}, + {2, 3, 6, 7}, + {8, 9, 12, 13}, + {10, 11, 14, 15} + }; + int i, j; + for (i=0; i<4; ++i) + { + map[i] = 1; + for (j=0; j<4 && map[j]; ++j) + map[i] &= (mode_info_context->bmi[ndx[i][j]].mv.as_mv.row <= 2 && + mode_info_context->bmi[ndx[i][j]].mv.as_mv.col <= 2); + } + } + else + { + map[0] = map[1] = map[2] = map[3] = + (mode_info_context->mbmi.mode > B_PRED && + abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 && + abs(mode_info_context->mbmi.mv.as_mv.col) <= 2); + } + return (map[0]+map[1]+map[2]+map[3]); +} + +void vp8_multiframe_quality_enhance +( + VP8_COMMON *cm +) +{ + YV12_BUFFER_CONFIG *show = cm->frame_to_show; + YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer; + + FRAME_TYPE frame_type = cm->frame_type; + /* Point at base of Mb MODE_INFO list has motion vectors etc */ + const MODE_INFO *mode_info_context = cm->mi; + int mb_row; + int mb_col; + int totmap, map[4]; + int qcurr = cm->base_qindex; + int qprev = cm->postproc_state.last_base_qindex; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + unsigned char *yd_ptr, *ud_ptr, *vd_ptr; 
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (frame_type == INTER_FRAME) totmap = qualify_inter_mb(mode_info_context, map);
+            else totmap = (frame_type == KEY_FRAME ? 4 : 0);
+            if (totmap)
+            {
+                if (totmap < 4)
+                {
+                    int i, j;
+                    for (i=0; i<2; ++i)
+                        for (j=0; j<2; ++j)
+                        {
+                            if (map[i*2+j])
+                            {
+                                multiframe_quality_enhance_block(8, qcurr, qprev,
+                                                                 y_ptr + 8*(i*show->y_stride+j),
+                                                                 u_ptr + 4*(i*show->uv_stride+j),
+                                                                 v_ptr + 4*(i*show->uv_stride+j),
+                                                                 show->y_stride,
+                                                                 show->uv_stride,
+                                                                 yd_ptr + 8*(i*dest->y_stride+j),
+                                                                 ud_ptr + 4*(i*dest->uv_stride+j),
+                                                                 vd_ptr + 4*(i*dest->uv_stride+j),
+                                                                 dest->y_stride,
+                                                                 dest->uv_stride);
+                            }
+                            else
+                            {
+                                /* copy an 8x8 block */
+                                int k;
+                                unsigned char *up = u_ptr + 4*(i*show->uv_stride+j);
+                                unsigned char *udp = ud_ptr + 4*(i*dest->uv_stride+j);
+                                unsigned char *vp = v_ptr + 4*(i*show->uv_stride+j);
+                                unsigned char *vdp = vd_ptr + 4*(i*dest->uv_stride+j);
+                                vp8_copy_mem8x8(y_ptr + 8*(i*show->y_stride+j), show->y_stride,
+                                                yd_ptr + 8*(i*dest->y_stride+j), dest->y_stride);
+                                for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
+                                                        vp += show->uv_stride, vdp += dest->uv_stride)
+                                {
+                                    vpx_memcpy(udp, up, 4);
+                                    vpx_memcpy(vdp, vp, 4);
+                                }
+                            }
+                        }
+                }
+                else /* totmap = 4 */
+                {
+                    multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+                                                     u_ptr, v_ptr,
+                                                     show->y_stride,
+                                                     show->uv_stride,
+                                                     yd_ptr, ud_ptr, vd_ptr,
+                                                     dest->y_stride,
+                                                     dest->uv_stride);
+                }
+            }
+            else
+            {
+                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16;
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+
+        y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
+        u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+        ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
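The blend at the heart of filter_by_weight is plain fixed-point interpolation between the current and the previous reconstruction. A one-pixel sketch, assuming MFQE_PRECISION is 4 (its definition lives in postproc.h, not in this hunk), so weights run 0..16:

    #include <stdio.h>

    #define MFQE_PRECISION 4   /* assumed; defined in postproc.h */

    int main(void)
    {
        int src = 180, dst = 60;   /* current / previous frame pixel */
        int src_weight;

        for (src_weight = 0; src_weight <= (1 << MFQE_PRECISION); src_weight += 4)
        {
            int dst_weight = (1 << MFQE_PRECISION) - src_weight;
            int rounding   = 1 << (MFQE_PRECISION - 1);
            int out = (src * src_weight + dst * dst_weight + rounding)
                      >> MFQE_PRECISION;
            printf("src_weight=%2d -> %3d\n", src_weight, out);  /* 60 .. 180 */
        }
        return 0;
    }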
diff --git a/vp8/common/mips/dspr2/dequantize_dspr2.c b/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000000000000000000000000000000000000..682332573fe06c23eab71b26118caafc757c78a1
--- /dev/null
+++ b/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq,
+                                unsigned char *dest, int stride)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+
+    vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+
+    vpx_memset(input, 0, 32);
+
+}
+
+#endif
diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000000000000000000000000000000000000..71fdcd7150ff0c4a7f6389517f33ca434817b9f8
--- /dev/null
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2823 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+
+static const unsigned short sub_pel_filterss[8][3] =
+{
+    { 0,      0,      0},
+    { 0,      0x0601, 0x7b0c},
+    { 0x0201, 0x0b08, 0x6c24},
+    { 0,      0x0906, 0x5d32},
+    { 0x0303, 0x1010, 0x4d4d},
+    { 0,      0x0609, 0x325d},
+    { 0x0102, 0x080b, 0x246c},
+    { 0,      0x0106, 0x0c7b},
+};
+
+
+static const int sub_pel_filters_int[8][3] =
+{
+    { 0,          0,          0},
+    { 0x0000fffa, 0x007b000c, 0xffff0000},
+    { 0x0002fff5, 0x006c0024, 0xfff80001},
+    { 0x0000fff7, 0x005d0032, 0xfffa0000},
+    { 0x0003fff0, 0x004d004d, 0xfff00003},
+    { 0x0000fffa, 0x0032005d, 0xfff70000},
+    { 0x0001fff8, 0x0024006c, 0xfff50002},
+    { 0x0000ffff, 0x000c007b, 0xfffa0000},
+};
+
+
+static const int sub_pel_filters_inv[8][3] =
+{
+    { 0,          0,          0},
+    { 0xfffa0000, 0x000c007b, 0x0000ffff},
+    { 0xfff50002, 0x0024006c, 0x0001fff8},
+    { 0xfff70000, 0x0032005d, 0x0000fffa},
+    { 0xfff00003, 0x004d004d, 0x0003fff0},
+    { 0xfffa0000, 0x005d0032, 0x0000fff7},
+    { 0xfff80001, 0x006c0024, 0x0002fff5},
+    { 0xffff0000, 0x007b000c, 0x0000fffa},
+};
+
+
+static const int sub_pel_filters_int_tap_4[8][2] =
+{
+    { 0,          0},
+    { 0xfffa007b, 0x000cffff},
+    { 0,          0},
+    { 0xfff7005d, 0x0032fffa},
+    { 0,          0},
+    { 0xfffa0032, 0x005dfff7},
+    { 0,          0},
+    { 0xffff000c, 0x007bfffa},
+};
+
+
+static const int sub_pel_filters_inv_tap_4[8][2] =
+{
+    { 0,          0},
+    { 0x007bfffa, 0xffff000c},
+    { 0,          0},
+    { 0x005dfff7, 0xfffa0032},
+    { 0,          0},
+    { 0x0032fffa, 0xfff7005d},
+    { 0,          0},
+    { 0x000cffff, 0xfffa007b},
+};
+
+inline void prefetch_load(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+
+inline void prefetch_store(unsigned char *dst)
+{
+    __asm__ __volatile__ (
+        "pref   1,  0(%[dst])   \n\t"
+        :
+        : [dst] "r" (dst)
+    );
+}
+
+void dsputil_static_init(void)
+{
+    int i;
+
+    for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
+
+    for (i = 0; i < CROP_WIDTH; i++)
+    {
+        ff_cropTbl[i] = 0;
+        ff_cropTbl[i + CROP_WIDTH + 256] = 255;
+    }
+}
+
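The point of ff_cropTbl and dsputil_static_init above is to turn the clamp-to-[0,255] at the end of every filter tap into a single table load; the lbux instructions in the assembly below index the table through cm = ff_cropTbl + CROP_WIDTH. The same table in isolation:

    #include <stdio.h>

    #define CROP_WIDTH 256
    static unsigned char crop_tbl[256 + 2 * CROP_WIDTH];

    int main(void)
    {
        const unsigned char *cm = crop_tbl + CROP_WIDTH;
        int i;

        /* identical setup to dsputil_static_init */
        for (i = 0; i < 256; i++) crop_tbl[i + CROP_WIDTH] = (unsigned char) i;
        for (i = 0; i < CROP_WIDTH; i++)
        {
            crop_tbl[i] = 0;
            crop_tbl[i + CROP_WIDTH + 256] = 255;
        }

        /* cm[x] clamps any x in [-256, 511] with one load */
        printf("%d %d %d\n", cm[-40], cm[128], cm[300]);   /* 0 128 255 */
        return 0;
    }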
+void vp8_filter_block2d_first_pass_4
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT dst_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    int xoffset,
+    int pitch
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a = 64;
+    int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3;
+    unsigned int n1, n2, n3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector3b = sub_pel_filters_inv[xoffset][2];
+
+    /* if (xoffset == 0) we don't need any filtering */
+    if (vector3b == 0)
+    {
+        for (i = 0; i < output_height; i++)
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + src_pixels_per_line);
+            dst_ptr[0] = src_ptr[0];
+            dst_ptr[1] = src_ptr[1];
+            dst_ptr[2] = src_ptr[2];
+            dst_ptr[3] = src_ptr[3];
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            dst_ptr += 4;
+        }
+    }
+    else
+    {
+        if (vector3b > 65536)
+        {
+            /* 6 tap filter */
+
+            vector1b = sub_pel_filters_inv[xoffset][0];
+            vector2b = sub_pel_filters_inv[xoffset][1];
+
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + src_pixels_per_line);
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -2(%[src_ptr])              \n\t"
+                    "ulw              %[tp2],      2(%[src_ptr])               \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                        \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                      \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                      \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                      \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],         %[vector1b]  \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],         %[vector2b]  \n\t"
+                    "dpa.w.ph         $ac3,        %[p3],         %[vector3b]  \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                        \n\t"
+                    "preceu.ph.qbl    %[p1],       %[tp2]                      \n\t"
+                    "balign           %[tp2],      %[tp1],        3            \n\t"
+                    "extp             %[Temp1],    $ac3,          9            \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],         %[vector1b]  \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],         %[vector2b]  \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],         %[vector3b]  \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn2],      3(%[src_ptr])               \n\t"
+                    "mtlo             %[vector4a], $ac3                        \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                      \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                      \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                      \n\t"
+                    "extp             %[Temp3],    $ac2,          9            \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],         %[vector1b]  \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],         %[vector2b]  \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],         %[vector3b]  \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                        \n\t"
+                    "preceu.ph.qbl    %[n1],       %[tn2]                      \n\t"
+                    "extp             %[Temp2],    $ac3,          9            \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],         %[vector1b]  \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],         %[vector2b]  \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],         %[vector3b]  \n\t"
+                    "extp             %[Temp4],    $ac2,          9            \n\t"
+
+                    /* clamp */
+                    "lbux             %[tp1],      %[Temp1](%[cm])             \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])             \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])             \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])             \n\t"
+
+                    /* store bytes */
+                    "sb               %[tp1],      0(%[dst_ptr])               \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])               \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])               \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])               \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
+                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
+                );
+
+                /* Next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+        else
+        {
+            /* 4 tap filter */
+
+            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])              \n\t"
+                    "ulw              %[tp2],      3(%[src_ptr])               \n\t"
+
+                    /* even 1.
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 1. pixel */ + "srl %[tn1], %[tp2], 8 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn1] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "sb %[tn1], 1(%[dst_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[tp2], 2(%[dst_ptr]) \n\t" + "sb %[n2], 3(%[dst_ptr]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr), + [src_ptr] "r" (src_ptr) + ); + /* Next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } + } +} + +void vp8_filter_block2d_first_pass_8_all +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT dst_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + int xoffset, + int pitch +) +{ + unsigned int i; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a = 64; + unsigned int vector1b, vector2b, vector3b; + unsigned int tp1, tp2, tn1, tn2; + unsigned int p1, p2, p3, p4; + unsigned int n1, n2, n3, n4; + + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + /* if (xoffset == 0) we don't need any filtering */ + if (xoffset == 0) + { + for (i = 0; i < output_height; i++) + { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr[4] = src_ptr[4]; + dst_ptr[5] = src_ptr[5]; + dst_ptr[6] = src_ptr[6]; + dst_ptr[7] = src_ptr[7]; + + /* next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += 8; + } + } + else + { + vector3b = sub_pel_filters_inv[xoffset][2]; + + if (vector3b > 65536) + { + /* 6 tap filter */ + + vector1b = sub_pel_filters_inv[xoffset][0]; + vector2b = sub_pel_filters_inv[xoffset][1]; + + for (i = output_height; i--;) + { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "ulw %[tp1], -2(%[src_ptr]) \n\t" + "ulw %[tp2], 2(%[src_ptr]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p1], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + "ulw %[tn2], 3(%[src_ptr]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "ulw %[tp1], 6(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__ ( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + + "ulw %[tn1], 7(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n4], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r" (tn1), [n2] "=&r" (n2), + [p4] "=&r" (p4), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2), + [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1), + [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), + [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } + else + { + /* 4 tap filter */ + + vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; + vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; + + for (i = output_height; i--;) + { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "ulw %[tp1], -1(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + "ulw %[tp2], 3(%[src_ptr]) \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + "ulw %[tn2], 4(%[src_ptr]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "ulw %[tp1], 7(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2), + [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1), + [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__ ( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t" + "ulw %[tn1], 8(%[src_ptr]) \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4), + [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr), + [n3] "r" (n3), [n4] "r" (n4) + ); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + /* next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } + } +} + + +void vp8_filter_block2d_first_pass16_6tap +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT dst_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + int xoffset, + int pitch +) +{ + unsigned int i; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a; + unsigned int vector1b, vector2b, vector3b; + unsigned int tp1, tp2, tn1, tn2; + unsigned int p1, p2, p3, p4; + unsigned int n1, n2, n3, n4; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector1b = sub_pel_filters_inv[xoffset][0]; + vector2b = sub_pel_filters_inv[xoffset][1]; + vector3b = sub_pel_filters_inv[xoffset][2]; + vector4a = 64; + + for (i = output_height; i--;) + { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "ulw %[tp1], -2(%[src_ptr]) \n\t" + "ulw %[tp2], 2(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p1], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + "ulw %[tn2], 3(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "ulw %[tp1], 6(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__ ( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "ulw %[tn1], 7(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n4], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "ulw %[tp2], 10(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2), + [p4] "=&r" (p4), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1), + [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b), + [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__ ( + /* even 5. pixel */ + "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + + /* even 6. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t" + + "ulw %[tn1], 11(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 5. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 6. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n3], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t" + "ulw %[tp1], 14(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p4], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), + [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2), + [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr), + [vector4a] "r" (vector4a), [vector3b] "r" (vector3b) + ); + + /* clamp and store results */ + dst_ptr[8] = cm[Temp1]; + dst_ptr[9] = cm[Temp2]; + dst_ptr[10] = cm[Temp3]; + dst_ptr[11] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__ ( + /* even 7. pixel */ + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t" + + /* even 8. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t" + "ulw %[tn1], 15(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 7. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n4], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 8. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n2], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 12(%[dst_ptr]) \n\t" + "sb %[tn1], 13(%[dst_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[p2], 14(%[dst_ptr]) \n\t" + "sb %[n2], 15(%[dst_ptr]) \n\t" + + : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1), + [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3), + [n3] "r" (n3), [src_ptr] "r" (src_ptr), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } +} + + +void vp8_filter_block2d_first_pass16_0 +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + unsigned int src_pixels_per_line +) +{ + int Temp1, Temp2, Temp3, Temp4; + int i; + + /* prefetch src_ptr data to cache memory */ + prefetch_store(output_ptr + 32); + + /* copy memory from src buffer to dst buffer */ + for (i = 0; i < 7; i++) + { + __asm__ __volatile__ ( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[output_ptr]) \n\t" + "sw %[Temp2], 4(%[output_ptr]) \n\t" + "sw %[Temp3], 8(%[output_ptr]) \n\t" + "sw %[Temp4], 12(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], 
%[src_pixels_per_line] \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) + : [src_pixels_per_line] "r" (src_pixels_per_line), + [output_ptr] "r" (output_ptr) + ); + + __asm__ __volatile__ ( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[output_ptr]) \n\t" + "sw %[Temp2], 20(%[output_ptr]) \n\t" + "sw %[Temp3], 24(%[output_ptr]) \n\t" + "sw %[Temp4], 28(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) + : [src_pixels_per_line] "r" (src_pixels_per_line), + [output_ptr] "r" (output_ptr) + ); + + __asm__ __volatile__ ( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 32(%[output_ptr]) \n\t" + "sw %[Temp2], 36(%[output_ptr]) \n\t" + "sw %[Temp3], 40(%[output_ptr]) \n\t" + "sw %[Temp4], 44(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) + : [src_pixels_per_line] "r" (src_pixels_per_line), + [output_ptr] "r" (output_ptr) + ); + + output_ptr += 48; + } +} + + +void vp8_filter_block2d_first_pass16_4tap +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_width, + unsigned int output_height, + int xoffset, + int yoffset, + unsigned char *RESTRICT dst_ptr, + int pitch +) +{ + unsigned int i, j; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a; + int vector1b, vector2b; + unsigned int tp1, tp2, tp3, tn1; + unsigned int p1, p2, p3; + unsigned int n1, n2, n3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; + vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; + + /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */ + if (yoffset == 0) + { + output_height -= 5; + src_ptr += (src_pixels_per_line + src_pixels_per_line); + + for (i = output_height; i--;) + { + __asm__ __volatile__ ( + "ulw %[tp3], -1(%[src_ptr]) \n\t" + : [tp3] "=&r" (tp3) + : [src_ptr] "r" (src_ptr) + ); + + /* processing 4 adjacent pixels */ + for (j = 0; j < 16; j += 4) + { + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "ulw %[tp2], 3(%[src_ptr]) \n\t" + "move %[tp1], %[tp3] \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $0, $ac3 \n\t" + "move %[tp3], %[tp2] \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $0, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extr.w %[Temp1], $ac3, 7 \n\t" + + /* odd 1. 
pixel */ + "ulw %[tn1], 4(%[src_ptr]) \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $0, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn1] \n\t" + "extr.w %[Temp3], $ac2, 7 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $0, $ac2 \n\t" + "extr.w %[Temp2], $ac3, 7 \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "extr.w %[Temp4], $ac2, 7 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "sb %[tn1], 1(%[dst_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[tp2], 2(%[dst_ptr]) \n\t" + "sb %[n2], 3(%[dst_ptr]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), + [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr), + [src_ptr] "r" (src_ptr) + ); + + src_ptr += 4; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - 16; + dst_ptr += pitch; + } + } + else + { + for (i = output_height; i--;) + { + /* processing 4 adjacent pixels */ + for (j = 0; j < 16; j += 4) + { + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "ulw %[tp1], -1(%[src_ptr]) \n\t" + "ulw %[tp2], 3(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $0, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $0, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extr.w %[Temp1], $ac3, 7 \n\t" + + /* odd 1. pixel */ + "ulw %[tn1], 4(%[src_ptr]) \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $0, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn1] \n\t" + "extr.w %[Temp3], $ac2, 7 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $0, $ac2 \n\t" + "extr.w %[Temp2], $ac3, 7 \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "extr.w %[Temp4], $ac2, 7 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 0(%[output_ptr]) \n\t" + "sb %[tn1], 1(%[output_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[tp2], 2(%[output_ptr]) \n\t" + "sb %[n2], 3(%[output_ptr]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector4a] "r" (vector4a), [cm] "r" (cm), + [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr) + ); + + src_ptr += 4; + } + + /* next row... */ + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + + +void vp8_filter_block2d_second_pass4 +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, + int yoffset +) +{ + unsigned int i; + + int Temp1, Temp2, Temp3, Temp4; + unsigned int vector1b, vector2b, vector3b, vector4a; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + /* load filter coefficients */ + vector1b = sub_pel_filterss[yoffset][0]; + vector2b = sub_pel_filterss[yoffset][2]; + vector3b = sub_pel_filterss[yoffset][1]; + + if (vector1b) + { + /* 6 tap filter */ + + for (i = 2; i--;) + { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + /* do not allow compiler to reorder instructions */ + __asm__ __volatile__ ( + ".set noreorder \n\t" + : + : + ); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 14(%[src_ptr]) 
\n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), + [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + output_ptr += output_pitch; + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] 
\n\t" + + "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), + [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } + else + { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + for (i = 2; i--;) + { + /* do not allow compiler to reorder instructions */ + __asm__ __volatile__ ( + ".set noreorder \n\t" + : + : + ); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" 
(src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + output_ptr += output_pitch; + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } +} + + +void vp8_filter_block2d_second_pass_8 +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + unsigned int yoffset +) +{ + unsigned int i; + + int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8; + unsigned int vector1b, vector2b, vector3b, vector4a; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + vector1b = sub_pel_filterss[yoffset][0]; + vector2b = sub_pel_filterss[yoffset][2]; + vector3b = sub_pel_filterss[yoffset][1]; + + if (vector1b) + { + /* 6 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); 
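        /*
         * [Editor's note, not part of the patch] In this vertical pass the
         * source rows sit in the 8-byte-wide intermediate buffer, so the
         * byte offsets -16, -8, 0, 8, 16 and 24 used below address rows
         * -2..+3 of the column being filtered, and each accumulator
         * produces one output pixel.  dpau.h.qbr adds the two positive tap
         * pairs (outer pair via vector1b, centre pair via vector2b) and
         * dpsu.h.qbr subtracts the negative pair (vector3b); with the bias
         * of 64 preloaded through mtlo, the final extract corresponds to
         * the reference filter's (sum + 64) >> 7.  For example, with
         * yoffset == 4 (taps 3, -16, 77, 77, -16, 3) the scalar
         * equivalent is:
         *
         *     sum = 64 + 3 * row[-2] - 16 * row[-1] + 77 * row[0]
         *              + 77 * row[1] - 16 * row[2]  +  3 * row[3];
         *     out = ff_cropTbl[CROP_WIDTH + (sum >> 7)];
         */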
+ + for (i = output_height; i--;) + { + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), + [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), + [src_ptr] "r" (src_ptr) + ); + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + 
"dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp6], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp7], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac1, 9 \n\t" + + : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5), + [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), + [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } + else + { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + for (i = output_height; i--;) + { + __asm__ __volatile__ ( + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) + : [vector2b] 
"r" (vector2b), [vector3b] "r" (vector3b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + __asm__ __volatile__ ( + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + : [Temp1] "=r" (Temp1), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + src_ptr_l1 = src_ptr[-6]; + src_ptr_0 = src_ptr[2]; + src_ptr_r1 = src_ptr[10]; + src_ptr_r2 = src_ptr[18]; + + __asm__ __volatile__ ( + "mtlo %[vector4a], $ac0 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + : [Temp2] "=r" (Temp2) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + src_ptr_l1 = src_ptr[-5]; + src_ptr_0 = src_ptr[3]; + src_ptr_r1 = src_ptr[11]; + src_ptr_r2 = src_ptr[19]; + + __asm__ __volatile__ ( + "mtlo %[vector4a], $ac1 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + : [Temp3] "=r" (Temp3) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + src_ptr_l1 = src_ptr[-4]; + src_ptr_0 = src_ptr[4]; + src_ptr_r1 = src_ptr[12]; + src_ptr_r2 = src_ptr[20]; + + __asm__ __volatile__ ( + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp4] "=r" (Temp4) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + src_ptr_l1 = src_ptr[-3]; + src_ptr_0 = src_ptr[5]; + src_ptr_r1 = src_ptr[13]; + src_ptr_r2 = src_ptr[21]; + + __asm__ __volatile__ ( + "mtlo %[vector4a], $ac3 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + : [Temp5] "=&r" (Temp5) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + src_ptr_l1 = src_ptr[-2]; + src_ptr_0 = src_ptr[6]; + src_ptr_r1 = src_ptr[14]; + src_ptr_r2 = src_ptr[22]; + + __asm__ 
__volatile__ ( + "mtlo %[vector4a], $ac0 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp6], $ac3, 9 \n\t" + + : [Temp6] "=r" (Temp6) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + src_ptr_l1 = src_ptr[-1]; + src_ptr_0 = src_ptr[7]; + src_ptr_r1 = src_ptr[15]; + src_ptr_r2 = src_ptr[23]; + + __asm__ __volatile__ ( + "mtlo %[vector4a], $ac1 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp7], $ac0, 9 \n\t" + "extp %[Temp8], $ac1, 9 \n\t" + + : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0), + [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2), + [vector4a] "r" (vector4a) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } +} + + +void vp8_filter_block2d_second_pass161 +( + unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, + const unsigned short *vp8_filter +) +{ + unsigned int i, j; + + int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8; + unsigned int vector4a; + unsigned int vector1b, vector2b, vector3b; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + vector1b = vp8_filter[0]; + vector2b = vp8_filter[2]; + vector3b = vp8_filter[1]; + + if (vector1b == 0) + { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 16); + + for (i = 16; i--;) + { + /* unrolling for loop */ + for (j = 0; j < 16; j += 8) + { + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp2], 
$ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp3], $ac1, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp4], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp6], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp7], $ac1, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac3, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), + [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6), + [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) + : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), + [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[j] = cm[Temp1]; + output_ptr[j + 1] = cm[Temp2]; + output_ptr[j + 2] = cm[Temp3]; + output_ptr[j + 3] = cm[Temp4]; + output_ptr[j + 4] = cm[Temp5]; + output_ptr[j + 5] = cm[Temp6]; + output_ptr[j + 6] = cm[Temp7]; + output_ptr[j + 7] = cm[Temp8]; + + src_ptr += 8; + } + + output_ptr += output_pitch; + } + } + else + { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 16); + + /* unroll for loop */ + for (i = 16; i--;) + { + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -16(%[src_ptr]) 
\n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp2], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp3], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp4], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr 
$ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp6], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp7], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac3, 9 \n\t" + + : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), + [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6), + [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), + [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), + [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), + [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), + [src_ptr] "r" (src_ptr) + ); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + /* apply filter with vectors pairs */ + __asm__ __volatile__ ( + "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -6(%[src_ptr]) 
\n\t" + "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp2], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp3], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp4], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp6], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp7], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, 
%[src_ptr_l2], %[vector1b] \n\t"
+                "dpau.h.qbr   $ac3, %[src_ptr_0], %[vector2b]  \n\t"
+                "dpsu.h.qbr   $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+                "extp         %[Temp8], $ac3, 9                \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            src_ptr += 16;
+            output_ptr[8] = cm[Temp1];
+            output_ptr[9] = cm[Temp2];
+            output_ptr[10] = cm[Temp3];
+            output_ptr[11] = cm[Temp4];
+            output_ptr[12] = cm[Temp5];
+            output_ptr[13] = cm[Temp6];
+            output_ptr[14] = cm[Temp7];
+            output_ptr[15] = cm[Temp8];
+
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_sixtap_predict4x4_dspr2
+(
+    unsigned char *RESTRICT src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int dst_pitch
+)
+{
+    unsigned char FData[9 * 4]; /* Temp data buffer used in filtering */
+    unsigned int pos = 16;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp %[pos], 1 \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        /* First filter 1-D horizontally... */
+        vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+                                        src_pixels_per_line, 9, xoffset, 4);
+
+        /* then filter vertically... */
+        vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+    }
+    else
+        /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
+                                        4, xoffset, dst_pitch);
+}
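For readers who do not read MIPS DSPR2 assembly, a plain-C sketch of the two-pass layout that every sixtap predictor in this file follows may help. This is an illustration, not the project's code: the helper name is invented, and where the DSPR2 routines clamp intermediate values to bytes between the passes, the sketch keeps 16-bit intermediates. The 2-rows-above / 3-rows-below source geometry and VP8's rounding (add 64, shift right by 7) match the code above.

static void sixtap_two_pass_sketch(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   const short *hfilter, const short *vfilter,
                                   int width, int height)
{
    short tmp[(16 + 5) * 16];                       /* height + 5 staging rows, worst case */
    const unsigned char *s = src - 2 * src_stride;  /* 2 context rows above the block */
    int r, c, t, sum;

    /* first pass: 6-tap horizontal filter into the staging buffer */
    for (r = 0; r < height + 5; r++)
        for (c = 0; c < width; c++)
        {
            sum = 0;
            for (t = -2; t <= 3; t++)
                sum += s[r * src_stride + c + t] * hfilter[t + 2];
            tmp[r * width + c] = (short)((sum + 64) >> 7);   /* VP8 Q7 rounding */
        }

    /* second pass: 6-tap vertical filter over the staging rows */
    for (r = 0; r < height; r++)
        for (c = 0; c < width; c++)
        {
            sum = 0;
            for (t = 0; t < 6; t++)
                sum += tmp[(r + t) * width + c] * vfilter[t];
            sum = (sum + 64) >> 7;
            dst[r * dst_stride + c] =
                (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
}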
+
+
+void vp8_sixtap_predict8x8_dspr2
+(
+    unsigned char *RESTRICT src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int dst_pitch
+)
+{
+    unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
+    unsigned int pos, Temp1, Temp2;
+
+    pos = 16;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp %[pos], 1 \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                13, xoffset, 8);
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+            __asm__ __volatile__ (
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 0(%[FData])    \n\t"
+                "sw   %[Temp2], 4(%[FData])    \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 8(%[FData])    \n\t"
+                "sw   %[Temp2], 12(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 16(%[FData])   \n\t"
+                "sw   %[Temp2], 20(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 24(%[FData])   \n\t"
+                "sw   %[Temp2], 28(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 32(%[FData])   \n\t"
+                "sw   %[Temp2], 36(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 40(%[FData])   \n\t"
+                "sw   %[Temp2], 44(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 48(%[FData])   \n\t"
+                "sw   %[Temp2], 52(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 56(%[FData])   \n\t"
+                "sw   %[Temp2], 60(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 64(%[FData])   \n\t"
+                "sw   %[Temp2], 68(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 72(%[FData])   \n\t"
+                "sw   %[Temp2], 76(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 80(%[FData])   \n\t"
+                "sw   %[Temp2], 84(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 88(%[FData])   \n\t"
+                "sw   %[Temp2], 92(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 96(%[FData])   \n\t"
+                "sw   %[Temp2], 100(%[FData])  \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+
+        /* filter vertically... */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
+    }
+    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                8, xoffset, dst_pitch);
+        else
+        {
+            /* copy from src buffer to dst buffer */
+            __asm__ __volatile__ (
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 0(%[dst_ptr])  \n\t"
+                "sw   %[Temp2], 4(%[dst_ptr])  \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 8(%[dst_ptr])  \n\t"
+                "sw   %[Temp2], 12(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 16(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 20(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 24(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 28(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 32(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 36(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 40(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 44(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 48(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 52(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 56(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 60(%[dst_ptr]) \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+    }
+}
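The long ulw/sw ladders above are straight row copies: ulw performs word loads from possibly unaligned source addresses, and the rows land densely packed, 8 bytes apart, in the staging buffer, so the vertical pass can use a fixed pitch regardless of the source stride. A hedged C equivalent of the staging step (the helper name is invented; memcpy stands in for the unaligned loads the assembly does with ulw):

#include <string.h>

/* Stage `rows` unaligned 8-byte source rows into a packed buffer. */
static void stage_rows_8(const unsigned char *src, int src_stride,
                         unsigned char *fdata, int rows)
{
    int r;

    for (r = 0; r < rows; r++)
        memcpy(fdata + 8 * r, src + r * src_stride, 8);  /* packed 8-byte rows */
}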
+
+
+void vp8_sixtap_predict8x4_dspr2
+(
+    unsigned char *RESTRICT src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int dst_pitch
+)
+{
+    unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
+    unsigned int pos, Temp1, Temp2;
+
+    pos = 16;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp %[pos], 1 \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                9, xoffset, 8);
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+            __asm__ __volatile__ (
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 0(%[FData])    \n\t"
+                "sw   %[Temp2], 4(%[FData])    \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 8(%[FData])    \n\t"
+                "sw   %[Temp2], 12(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 16(%[FData])   \n\t"
+                "sw   %[Temp2], 20(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 24(%[FData])   \n\t"
+                "sw   %[Temp2], 28(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 32(%[FData])   \n\t"
+                "sw   %[Temp2], 36(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 40(%[FData])   \n\t"
+                "sw   %[Temp2], 44(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 48(%[FData])   \n\t"
+                "sw   %[Temp2], 52(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 56(%[FData])   \n\t"
+                "sw   %[Temp2], 60(%[FData])   \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 64(%[FData])   \n\t"
+                "sw   %[Temp2], 68(%[FData])   \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+
+        /* filter vertically... */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
+    }
+    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                4, xoffset, dst_pitch);
+        else
+        {
+            /* copy from src buffer to dst buffer */
+            __asm__ __volatile__ (
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 0(%[dst_ptr])  \n\t"
+                "sw   %[Temp2], 4(%[dst_ptr])  \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 8(%[dst_ptr])  \n\t"
+                "sw   %[Temp2], 12(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 16(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 20(%[dst_ptr]) \n\t"
+                "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+                "ulw  %[Temp1], 0(%[src_ptr])  \n\t"
+                "ulw  %[Temp2], 4(%[src_ptr])  \n\t"
+                "sw   %[Temp1], 24(%[dst_ptr]) \n\t"
+                "sw   %[Temp2], 28(%[dst_ptr]) \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+    }
+}
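A pattern worth making explicit: each predictor sizes its staging buffer as (height + 5) rows of width bytes (two context rows above, three below, for the six vertical taps), and hands the second pass a pointer advanced past the two leading rows. A small sketch of that arithmetic, with names invented for illustration:

/* Staging-buffer geometry shared by the sixtap predictors (sketch). */
enum { SIXTAP_EXTRA_ROWS = 5, SIXTAP_ROWS_ABOVE = 2 };

static unsigned staging_bytes(unsigned width, unsigned height)
{
    return (height + SIXTAP_EXTRA_ROWS) * width;   /* 9*4, 13*8, 9*8, 21*16 */
}

static unsigned char *second_pass_origin(unsigned char *fdata, unsigned width)
{
    return fdata + SIXTAP_ROWS_ABOVE * width;      /* FData+8, +16, +16, +32 */
}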
+
+
+void vp8_sixtap_predict16x16_dspr2
+(
+    unsigned char *RESTRICT src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int dst_pitch
+)
+{
+    const unsigned short *VFilter;
+    unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
+    unsigned int pos;
+
+    VFilter = sub_pel_filterss[yoffset];
+
+    pos = 16;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp %[pos], 1 \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        switch (xoffset)
+        {
+        /* filter 1-D horizontally... */
+        case 2:
+        case 4:
+        case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
+                                                 21, xoffset, 16);
+            break;
+
+        case 0:
+            /* only copy buffer */
+            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+            break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+
+        /* filter vertically... */
+        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+    }
+    else
+    {
+        /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+        switch (xoffset)
+        {
+        case 2:
+        case 4:
+        case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
+                                                 16, xoffset, dst_pitch);
+            break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+    }
+}
+
+#endif
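The switch on xoffset encodes a property of VP8's sub-pixel filter table: even offsets (2, 4, 6) need all six taps, while odd offsets (1, 3, 5, 7) have zero outer taps and reduce to four. The six-tap coefficient table as given in the VP8 specification (RFC 6386), reproduced here for reference only:

/* VP8 six-tap sub-pel filter coefficients; each row sums to 128 (Q7).
   Rows for odd offsets have zero first/last taps, hence the 4-tap path. */
static const short vp8_six_tap_ref[8][6] =
{
    { 0,   0, 128,   0,   0, 0 },   /* xoffset 0: full-pel copy */
    { 0,  -6, 123,  12,  -1, 0 },   /* 1: 4 effective taps      */
    { 2, -11, 108,  36,  -8, 1 },   /* 2: 6 taps                */
    { 0,  -9,  93,  50,  -6, 0 },   /* 3: 4 effective taps      */
    { 3, -16,  77,  77, -16, 3 },   /* 4: 6 taps                */
    { 0,  -6,  50,  93,  -9, 0 },   /* 5: 4 effective taps      */
    { 1,  -8,  36, 108, -11, 2 },   /* 6: 6 taps                */
    { 0,  -1,  12, 123,  -6, 0 }    /* 7: 4 effective taps      */
};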
diff --git a/vp8/common/mips/dspr2/idct_blk_dspr2.c b/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e0ebd161e4195c05517e5b69ac93998f24f1415
--- /dev/null
+++ b/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+
+void vp8_dequant_idct_add_y_block_dspr2
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q   += 16;
+            dst += 4;
+        }
+
+        dst += 4 * stride - 16;
+    }
+}
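The eobs test relies on the coefficients being in zig-zag order: an end-of-block value of 0 or 1 means at most the DC coefficient is nonzero, so the full 4x4 inverse transform collapses to adding one rounded constant to every predictor pixel, after which the caller zeroes the consumed DC pair with ((int *)q)[0] = 0. A hedged sketch of what the DC-only path computes, mirroring the generic C routine (vp8_dc_only_idct_add_c) that the DSPR2 version specializes:

/* DC-only inverse transform: every output pixel gets the same delta. */
static void dc_only_idct_add_sketch(short input_dc,
                                    unsigned char *pred, int pred_stride,
                                    unsigned char *dst, int dst_stride)
{
    int a1 = (input_dc + 4) >> 3;   /* same rounding as the full IDCT's final shift */
    int r, c, v;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            v = pred[c] + a1;
            dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }

        pred += pred_stride;
        dst += dst_stride;
    }
}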
+void vp8_dequant_idct_add_uv_block_dspr2
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dstu, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstu, stride, dstu, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            dstu += 4;
+        }
+
+        dstu += 4 * stride - 8;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dstv, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstv, stride, dstv, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            q    += 16;
+            dstv += 4;
+        }
+
+        dstv += 4 * stride - 8;
+    }
+}
+
+#endif
+
diff --git a/vp8/common/mips/dspr2/idctllm_dspr2.c b/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000000000000000000000000000000000000..25b7936431591decef8e4765b4a256575a802195
--- /dev/null
+++ b/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,369 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point versions of two
+ * multiply constants:
+ * 1.  sqrt(2) * cos(pi/8)
+ * 2.  sqrt(2) * sin(pi/8)
+ * Since the first constant is greater than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use the identity
+ *   x * a = x + x * (a - 1)
+ * so
+ *   x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ ****************************************************************************/
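The two constants declared just below can be checked directly against that comment; a quick stand-alone verification sketch (not part of the patch):

#include <math.h>
#include <stdio.h>

/* Verify the 16-bit fixed-point constants:
   20091 ~= (sqrt(2)*cos(pi/8) - 1) * 2^16, 35468 ~= sqrt(2)*sin(pi/8) * 2^16 */
int main(void)
{
    double pi = 3.14159265358979323846;

    printf("%.0f\n", (sqrt(2.0) * cos(pi / 8.0) - 1.0) * 65536.0); /* 20091 */
    printf("%.0f\n", sqrt(2.0) * sin(pi / 8.0) * 65536.0);         /* 35468 */
    return 0;
}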
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+inline void prefetch_load_short(short *src)
+{
+    __asm__ __volatile__ (
+        "pref 0, 0(%[src]) \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+                                int pred_stride, unsigned char *dst_ptr,
+                                int dst_stride)
+{
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+
+    int c2, d2;
+    int temp3, temp4;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + 8);
+
+    /* first loop is unrolled */
+    a1 = ip[0] + ip[8];
+    b1 = ip[0] - ip[8];
+
+    temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[13] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+
+    a1 = ip[1] + ip[9];
+    b1 = ip[1] - ip[9];
+
+    op[1] = a1 + d2;
+    op[13] = a1 - d2;
+    op[5] = b1 + c2;
+    op[9] = b1 - c2;
+
+    a1 = ip[2] + ip[10];
+    b1 = ip[2] - ip[10];
+
+    temp1 = (ip[6] * sinpi8sqrt2) >> 16;
+    temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[14] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[7] * sinpi8sqrt2) >> 16;
+    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[2] = a1 + d1;
+    op[14] = a1 - d1;
+    op[6] = b1 + c1;
+    op[10] = b1 - c1;
+
+    a1 = ip[3] + ip[11];
+    b1 = ip[3] - ip[11];
+
+    op[3] = a1 + d2;
+    op[15] = a1 - d2;
+    op[7] = b1 + c2;
+    op[11] = b1 - c2;
+
+    ip = output;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + shortpitch);
+
+    /* second loop is unrolled */
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[7] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+
+    a1 = ip[4] + ip[6];
+    b1 = ip[4] - ip[6];
+
+    op[4] = (a1 + d2 + 4) >> 3;
+    op[7] = (a1 - d2 + 4) >> 3;
+    op[5] = (b1 + c2 + 4) >> 3;
+    op[6] = (b1 - c2 + 4) >>