Commit 712762b5 authored by John Koleszar's avatar John Koleszar
Browse files

Merge remote branch 'origin/master' into experimental

Change-Id: Ic698ea5f5b31a5faf467eb0da4b762f9586df938
parents 87e570e6 56417a30
......@@ -2,3 +2,4 @@ Adrian Grange <agrange@google.com>
Johann Koenig <johannkoenig@google.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Tom Finegan <tomfinegan@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
......@@ -4,8 +4,11 @@
Aaron Watry <awatry@gmail.com>
Adrian Grange <agrange@google.com>
Alex Converse <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
......@@ -22,20 +25,29 @@ Jeff Muizelaar <jmuizelaar@mozilla.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
Martin Ettl <ettl.martin78@googlemail.com>
Michael Kohler <michaelkohler@live.com>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Philip Jägenstedt <philipj@opera.com>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Ralph Giles <giles@xiph.org>
Ronald S. Bultje <rbultje@google.com>
Scott LaVarnway <slavarnway@google.com>
Stefan Holmer <holmer@google.com>
Taekhyun Kim <takim@nvidia.com>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Yaowu Xu <yaowu@google.com>
......
2011-08-02 v0.9.7 "Cayuga"
Our third named release, focused on a faster, higher-quality encoder.
- Upgrading:
This release is backwards compatible with Aylesbury (v0.9.5) and
Bali (v0.9.6). Users of older releases should refer to the Upgrading
notes in this document for that release.
- Enhancements:
Stereo 3D format support for vpxenc
Runtime detection of available processor cores.
Allow specifying --end-usage by enum name
vpxdec: test for frame corruption
vpxenc: add quantizer histogram display
vpxenc: add rate histogram display
Set VPX_FRAME_IS_DROPPABLE
Update configure for iOS SDK 4.3
Avoid text relocations in ARM vp8 decoder
Generate a vpx.pc file for pkg-config.
New ways of passing encoded data between encoder and decoder.
- Speed:
This release includes across-the-board speed improvements to the
encoder. On x86, these measure at approximately 11.5% in Best mode,
21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
On ARM Cortex A9 with Neon extensions, real-time encoding of video
telephony content is 35% faster than Bali on single core and 48%
faster on multi-core. On the NVidia Tegra2 platform, real time
encoding is 40% faster than Bali.
Decoder speed was not a priority for this release, but improved
approximately 8.4% on x86.
Reduce motion vector search on alt-ref frame.
Encoder loopfilter running in its own thread
Reworked loopfilter to precalculate more parameters
SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
Removed redundant checks
Reduced structure sizes
utilize preload in ARMv6 MC/LPF/Copy routines
ARM optimized quantization, dfct, variance, subtract
Increase chroma row alignment to 16 bytes.
disable trellis optimization for first pass
Write SSSE3 sub-pixel filter function
Improve SSE2 half-pixel filter functions
Add vp8_sub_pixel_variance16x8_ssse3 function
Reduce unnecessary distortion computation
Use diamond search to replace full search
Preload reference area in sub-pixel motion search (real-time mode)
- Quality:
This release focused primarily on one-pass use cases, including
video conferencing. Low latency data rate control was significantly
improved, improving streamability over bandwidth constrained links.
Added support for error concealment, allowing frames to maintain
visual quality in the presence of substantial packet loss.
Add rc_max_intra_bitrate_pct control
Limit size of initial keyframe in one-pass.
Improve framerate adaptation
Improved 1-pass CBR rate control
Improved KF insertion after fades to still.
Improved key frame detection.
Improved activity masking (lower PSNR impact for same SSIM boost)
Improved interaction between GF and ARFs
Adding error-concealment to the decoder.
Adding support for independent partitions
Adjusted rate-distortion constants
- Bug Fixes:
Removed firstpass motion map
Fix parallel make install
Fix multithreaded encoding for frames that are 1 macroblock (MB) wide
Fixed iwalsh_neon build problems with RVDS4.1
Fix semaphore emulation, spin-wait intrinsics on Windows
Fix build with xcode4 and simplify GLOBAL.
Mark ARM asm objects as allowing a non-executable stack.
Fix vpxenc encoding incorrect webm file header on big endian
2011-03-07 v0.9.6 "Bali"
Our second named release, focused on a faster, higher-quality encoder.
......
......@@ -336,6 +336,7 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_proj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
# Include obj_int_extract if we use offsets from asm_*_offsets
DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c
......
......@@ -952,6 +952,10 @@ process_common_toolchain() {
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC
# Work around longjmp interception on glibc >= 2.11, to improve binary
# compatibility. See http://code.google.com/p/webm/issues/detail?id=166
enabled linux && check_add_cflags -D_FORTIFY_SOURCE=0
# Check for strip utility variant
${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
......
......@@ -35,6 +35,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
......@@ -47,6 +48,7 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8dx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
......@@ -89,6 +91,7 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
......@@ -100,7 +103,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
endif
CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
......@@ -177,7 +180,7 @@ endif
else
LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
LIBS-$(CONFIG_STATIC) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
......@@ -269,20 +272,20 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
#
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(VP8_PREFIX)common/asm_com_offsets.c.S: vp8/common/asm_com_offsets.c
CLEAN-OBJS += asm_com_offsets.asm $(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: vp8/encoder/asm_enc_offsets.c
CLEAN-OBJS += asm_enc_offsets.asm $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
$(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: vp8/decoder/asm_dec_offsets.c
CLEAN-OBJS += asm_dec_offsets.asm $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
asm_com_offsets.asm: obj_int_extract
......
......@@ -9,6 +9,8 @@
*/
#include "vpx_config.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
#include "vpx_scale/yv12config.h"
......@@ -25,8 +27,14 @@ DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_b
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
#if HAVE_ARMV7
/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
#endif
......@@ -19,7 +19,9 @@
#include "entropy.h"
#include "idct.h"
#include "recon.h"
#if CONFIG_POSTPROC
#include "postproc.h"
#endif
/*#ifdef PACKET_TESTING*/
#include "header.h"
......@@ -75,7 +77,9 @@ typedef struct VP8_COMMON_RTCD
vp8_recon_rtcd_vtable_t recon;
vp8_subpix_rtcd_vtable_t subpix;
vp8_loopfilter_rtcd_vtable_t loopfilter;
#if CONFIG_POSTPROC
vp8_postproc_rtcd_vtable_t postproc;
#endif
int flags;
#else
int unused;
......@@ -202,7 +206,9 @@ typedef struct VP8Common
#if CONFIG_MULTITHREAD
int processor_core_count;
#endif
#if CONFIG_POSTPROC
struct postproc_state postproc_state;
#endif
} VP8_COMMON;
#endif
......@@ -13,7 +13,6 @@
#include "vpx_ports/arm.h"
#include "vp8/common/blockd.h"
#include "vp8/common/pragmas.h"
#include "vp8/common/postproc.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
......
......@@ -9,6 +9,7 @@
*/
#include "vpx_config.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
#include "vp8/common/systemdependent.h"
......@@ -24,7 +25,9 @@
#include "segmentation.h"
#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
#if CONFIG_POSTPROC
#include "vp8/common/postproc.h"
#endif
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
......@@ -2660,6 +2663,8 @@ static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
cpi->Source = &cpi->scaled_source;
#endif
}
else
cpi->Source = sd;
}
......
......@@ -24,7 +24,6 @@
#include "segmentation.h"
#include "vp8/common/g_common.h"
#include "vpx_scale/yv12extend.h"
#include "vp8/common/postproc.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
......
......@@ -15,6 +15,7 @@ VP8_COMMON_SRCS-yes += common/ppflags.h
VP8_COMMON_SRCS-yes += common/onyx.h
VP8_COMMON_SRCS-yes += common/onyxd.h
VP8_COMMON_SRCS-yes += common/alloccommon.c
VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
VP8_COMMON_SRCS-yes += common/debugmodes.c
......@@ -101,14 +102,16 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
# common (c)
VP8_COMMON_SRCS-$(ARCH_ARM) += common/asm_com_offsets.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h
# common (armv6)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
......
......@@ -15,9 +15,12 @@
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/encodemb_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.h
......
......@@ -48,6 +48,7 @@ VP8_DX_SRCS-yes += vp8_dx_iface.c
#INCLUDES += common
#INCLUDES += decoder
VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c
VP8_DX_SRCS-yes += decoder/decodframe.c
......
......@@ -12,9 +12,8 @@
#VP8_DX_SRCS list is modified according to different platforms.
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c
VP8_CX_SRCS-$(ARCH_ARM) += decoder/asm_dec_offsets.c
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h
#File list for armv6
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
......
......@@ -18,42 +18,32 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
;Note: this is VP8 function, which has border=32 and 16. Internal y_width and y_height
; are always multiples of 16.
; we depend on VP8BORDERINPIXELS being 32
|vp8_yv12_extend_frame_borders_neon| PROC
push {r4 - r10, lr}
vpush {d8 - d15}
;Not need to load y_width, since: y_width = y_stride - 2*border
ldr r3, [r0, #yv12_buffer_config_border]
ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
ldr r4, [r0, #yv12_buffer_config_y_height]
ldr lr, [r0, #yv12_buffer_config_y_stride]
; Border = 32
ldr r3, [r0, #yv12_buffer_config_y_width] ; plane_width
ldr r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
ldr r4, [r0, #yv12_buffer_config_y_height] ; plane_height
ldr lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride
cmp r3, #16
beq b16_extend_frame_borders
; Border copy for Y plane
; copy the left and right most columns out
add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1
sub r5, r1, #32 ; dest_ptr1 = src_ptr1 - Border
;=======================
b32_extend_frame_borders
;border = 32
;=======================
;Border copy for Y plane
;copy the left and right most columns out
sub r5, r1, r3 ;destptr1
add r6, r1, lr
sub r6, r6, r3, lsl #1 ;destptr2
sub r2, r6, #1 ;srcptr2
;Do four rows at one time
mov r12, r4, lsr #2
mov r12, r4, lsr #2 ; plane_height / 4
copy_left_right_y
vld1.8 {d0[], d1[]}, [r1], lr
vld1.8 {d4[], d5[]}, [r2], lr
vld1.8 {d8[], d9[]}, [r1], lr
vld1.8 {d12[], d13[]}, [r2], lr
vld1.8 {d16[], d17[]}, [r1], lr
vld1.8 {d16[], d17[]}, [r1], lr
vld1.8 {d20[], d21[]}, [r2], lr
vld1.8 {d24[], d25[]}, [r1], lr
vld1.8 {d28[], d29[]}, [r2], lr
......@@ -81,15 +71,16 @@ copy_left_right_y
bne copy_left_right_y
;Now copy the top and bottom source lines into each line of the respective borders
ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
mul r8, r3, lr
ldr r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
mul r8, r4, lr ; plane_height * plane_stride
mov r12, lr, lsr #7
; copy width is plane_stride
mov r12, lr, lsr #7 ; plane_stride / 128
sub r6, r1, r3 ;destptr2
sub r2, r6, lr ;srcptr2
sub r1, r7, r3 ;srcptr1
sub r5, r1, r8 ;destptr1
sub r1, r1, #32 ; src_ptr1 = y_buffer - Border
add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride))
sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
copy_top_bottom_y
vld1.8 {q0, q1}, [r1]!
......@@ -101,7 +92,7 @@ copy_top_bottom_y
vld1.8 {q6, q7}, [r1]!
vld1.8 {q14, q15}, [r2]!
mov r7, r3
mov r7, #32 ; Border
top_bottom_32
subs r7, r7, #1
......@@ -115,44 +106,41 @@ top_bottom_32
vst1.8 {q6, q7}, [r5]!
vst1.8 {q14, q15}, [r6]!
add r5, r5, lr
sub r5, r5, #128
add r6, r6, lr
sub r6, r6, #128
add r5, r5, lr ; dest_ptr1 += plane_stride
sub r5, r5, #128 ; dest_ptr1 -= 128
add r6, r6, lr ; dest_ptr2 += plane_stride
sub r6, r6, #128 ; dest_ptr2 -= 128
bne top_bottom_32
sub r5, r1, r8
add r6, r2, lr
sub r5, r1, lr, asl #5 ; src_ptr1 - (Border* plane_stride)
add r6, r2, lr ; src_ptr2 + plane_stride
subs r12, r12, #1
bne copy_top_bottom_y
mov r7, lr, lsr #4 ;check to see if extra copy is needed
mov r7, lr, lsr #4 ; check to see if extra copy is needed
ands r7, r7, #0x7
bne extra_top_bottom_y
end_of_border_copy_y
;Border copy for U, V planes
ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
mov lr, lr, lsr #1 ;uv_stride
mov r3, r3, lsr #1 ;border
mov r4, r4, lsr #1 ;uv_height
mov r8, r8, lsr #2
; Border = 16
ldr r7, [r0, #yv12_buffer_config_u_buffer] ; src_ptr1
ldr lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
ldr r3, [r0, #yv12_buffer_config_uv_width] ; plane_width
ldr r4, [r0, #yv12_buffer_config_uv_height] ; plane_height
mov r10, #2
;copy the left and right most columns out
border_copy_uv
sub r5, r1, r3 ;destptr1
add r6, r1, lr
sub r6, r6, r3, lsl #1 ;destptr2
sub r2, r6, #1 ;srcptr2
mov r1, r7 ; src_ptr1 needs to be saved for second half of loop
sub r5, r1, #16 ; dest_ptr1 = src_ptr1 - Border
add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1
mov r7, r1
;Do eight rows at one time
mov r12, r4, lsr #3
mov r12, r4, lsr #3 ; plane_height / 8
copy_left_right_uv
vld1.8 {d0[], d1[]}, [r1], lr
......@@ -167,7 +155,7 @@ copy_left_right_uv
vld1.8 {d18[], d19[]}, [r2], lr
vld1.8 {d20[], d21[]}, [r1], lr
vld1.8 {d22[], d23[]}, [r2], lr
vld1.8 {d24[], d25[]}, [r1], lr
vld1.8 {d24[], d25[]}, [r1], lr
vld1.8 {d26[], d27[]}, [r2], lr
vld1.8 {d28[], d29[]}, [r1], lr
vld1.8 {d30[], d31[]}, [r2], lr
......@@ -194,12 +182,14 @@ copy_left_right_uv
bne copy_left_right_uv
;Now copy the top and bottom source lines into each line of the respective borders
mov r12, lr, lsr #6
mov r1, r7
mul r8, r4, lr ; plane_height * plane_stride
mov r12, lr, lsr #6 ; plane_stride / 64
sub r6, r1, r3 ;destptr2
sub r2, r6, lr ;srcptr2
sub r1, r7, r3 ;srcptr1
sub r5, r1, r8 ;destptr1
sub r1, r1, #16 ; src_ptr1 = u_buffer - Border
add r6, r1, r8 ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride)
sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
copy_top_bottom_uv
vld1.8 {q0, q1}, [r1]!
......@@ -207,7 +197,7 @@ copy_top_bottom_uv
vld1.8 {q2, q3}, [r1]!
vld1.8 {q10, q11}, [r2]!
mov r7, r3
mov r7, #16 ; Border
top_bottom_16
subs r7, r7, #1
......@@ -217,38 +207,37 @@ top_bottom_16
vst1.8 {q2, q3}, [r5]!
vst1.8 {q10, q11}, [r6]!
add r5, r5, lr
add r5, r5, lr ; dest_ptr1 += plane_stride
sub r5, r5, #64
add r6, r6, lr
add r6, r6, lr ; dest_ptr2 += plane_stride
sub r6, r6, #64
bne top_bottom_16
sub r5, r1, r8
add r6, r2, lr
sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
add r6, r2, lr ; dest_ptr2 = src_ptr2 + plane_stride
subs r12, r12, #1
bne copy_top_bottom_uv
mov r7, lr, lsr #3 ;check to see if extra copy is needed
mov r7, lr, lsr #3 ; check to see if extra copy is needed
ands r7, r7, #0x7
bne extra_top_bottom_uv
end_of_border_copy_uv
subs r10, r10, #1
ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
ldrne r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1
bne border_copy_uv
vpop {d8 - d15}
pop {r4 - r10, pc}
;;;;;;;;;;;;;;;;;;;;;;
;extra copy part for Y
extra_top_bottom_y
vld1.8 {q0}, [r1]!
vld1.8 {q2}, [r2]!
mov r9, r3, lsr #3