Commit b02fdf09 authored by John Koleszar

Merge remote branch 'origin/master' into experimental

Change-Id: I4e515276d197e1dfb1f3e75edfa9823d08c9b366
parents cc63deba 17c754fc
@@ -281,17 +281,17 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
$(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
......
@@ -58,10 +58,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_armv6;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_armv6;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_armv6;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_armv6;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_armv6;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_armv6;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
@@ -107,8 +107,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_neon;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_neon;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
......
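For context: these assignments fill in the encoder's run-time CPU detection (RTCD) table, so the change is purely one of dispatch — the fast4x4/fast8x4 slots now point at the accurate short FDCT instead of a separate approximate kernel. A minimal C sketch of the idea, using an illustrative struct rather than the real VP8_COMP/rtcd layout:

/* Illustrative RTCD-style dispatch table; struct and field names are
 * simplified, not the real vp8/encoder definitions. */
typedef void (*fdct_fn_t)(short *input, short *output, int pitch);

extern void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
extern void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
extern void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);

struct fdct_table {
    fdct_fn_t short4x4;
    fdct_fn_t short8x4;
    fdct_fn_t fast4x4;
    fdct_fn_t fast8x4;
    fdct_fn_t walsh_short4x4;
};

static void arm_fdct_init_sketch(struct fdct_table *fdct)
{
    fdct->short4x4 = vp8_short_fdct4x4_armv6;
    fdct->short8x4 = vp8_short_fdct8x4_armv6;
    /* After this change the "fast" slots alias the accurate short FDCT. */
    fdct->fast4x4  = vp8_short_fdct4x4_armv6;
    fdct->fast8x4  = vp8_short_fdct8x4_armv6;
    fdct->walsh_short4x4 = vp8_short_walsh4x4_armv6;
}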
@@ -53,10 +53,10 @@
sub r7, r5, #1 ; range-1
cmp r1, #0
mul r4, r4, r7 ; ((range-1) * probability)
mul r6, r4, r7 ; ((range-1) * probability)
mov r7, #1
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * probability) >> 8)
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
addne r2, r2, r4 ; if (bit) lowvalue += split
subne r4, r5, r4 ; if (bit) range = range-split
......
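The arithmetic in this hunk is the VP8 boolean-encoder step: derive a split point from the current range and the branch probability, then move lowvalue/range depending on the bit. The register change (the MUL result now lands in r6 instead of overwriting r4) presumably sidesteps the Rd == Rm restriction that older ARM cores place on MUL; the computed values are unchanged. A short C sketch of the step the comments describe (names are illustrative, renormalisation omitted):

/* Sketch of the boolean-encoder step mirrored by the assembly above. */
static void encode_bool_sketch(unsigned int *lowvalue, unsigned int *range,
                               int bit, unsigned char probability)
{
    /* split = 1 + (((range - 1) * probability) >> 8) */
    unsigned int split = 1 + ((((*range) - 1) * probability) >> 8);

    if (bit) {
        *lowvalue += split;   /* if (bit) lowvalue += split       */
        *range    -= split;   /* if (bit) range = range - split   */
    } else {
        *range = split;       /* otherwise the lower interval wins */
    }
    /* range renormalisation and bit output follow in the real encoder */
}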
@@ -71,7 +71,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -79,7 +79,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -172,12 +172,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
......
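The token_loop/extra_bits_loop hunks apply the same register change inside the token packer, where the bit being coded comes from walking vp8_coef_tree: each iteration peels the next bit bb off v, codes it with probability pp[i>>1], and steps to i = vp8_coef_tree[i + bb]. A rough C model of that loop, reconstructed from the comments (the helper name is illustrative and renormalisation is omitted):

/* Rough C model of token_loop: encode the bits of v by walking the
 * coefficient tree; pp[] holds the node probabilities. */
extern const signed char vp8_coef_tree[];   /* element type simplified here */

static void encode_tree_bits_sketch(unsigned int *lowvalue, unsigned int *range,
                                    const unsigned char *pp, int v, int n)
{
    int i = 0;

    do {
        const int bb = (v >> --n) & 1;                      /* bb = v >> n             */
        const unsigned int split =
            1 + ((((*range) - 1) * pp[i >> 1]) >> 8);       /* split from pp[i>>1]     */

        if (bb) {
            *lowvalue += split;                             /* if (bb) lowvalue += split   */
            *range    -= split;                             /* if (bb) range = range-split */
        } else {
            *range = split;
        }
        i = vp8_coef_tree[i + bb];                          /* i = vp8_coef_tree[i+bb]     */
        /* renormalisation omitted */
    } while (n);
}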
@@ -93,7 +93,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -101,7 +101,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -194,12 +194,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
......
@@ -123,7 +123,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -131,7 +131,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -224,12 +224,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
......
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_fdct4x4_armv6|
EXPORT |vp8_short_fdct4x4_armv6|
ARM
REQUIRE8
@@ -16,7 +16,7 @@
AREA |.text|, CODE, READONLY
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_fast_fdct4x4_armv6| PROC
|vp8_short_fdct4x4_armv6| PROC
stmfd sp!, {r4 - r12, lr}
......
@@ -17,129 +17,196 @@
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
; r0 short *input,
; r1 short *output,
; r2 int pitch
|vp8_short_walsh4x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
mov r12, r2 ; ugh. not clean
ldr r2, [r0] ; [1 | 0]
ldr r3, [r0, #4] ; [3 | 2]
ldr r4, [r0, r12]! ; [5 | 4]
ldr r5, [r0, #4] ; [7 | 6]
ldr r6, [r0, r12]! ; [9 | 8]
ldr r7, [r0, #4] ; [11 | 10]
ldr r8, [r0, r12]! ; [13 | 12]
ldr r9, [r0, #4] ; [15 | 14]
qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
qaddsubx r2, r10, r11 ; [1 | 2] [c1+d1 | a1-b1]
qaddsubx r3, r11, r10 ; [0 | 3] [b1+a1 | d1-c1]
qaddsubx r4, r12, lr ; [5 | 6] [c1+d1 | a1-b1]
qaddsubx r5, lr, r12 ; [4 | 7] [b1+a1 | d1-c1]
qsubaddx r10, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r11, r6, r7 ; [b1|d1] [9+10 | 8-11]
qsubaddx r12, r8, r9 ; [c1|a1] [13-14 | 12+15]
qaddsubx lr, r8, r9 ; [b1|d1] [13+14 | 12-15]
qaddsubx r6, r10, r11 ; [9 |10] [c1+d1 | a1-b1]
qaddsubx r7, r11, r10 ; [8 |11] [b1+a1 | d1-c1]
qaddsubx r8, r12, lr ; [13|14] [c1+d1 | a1-b1]
qaddsubx r9, lr, r12 ; [12|15] [b1+a1 | d1-c1]
; first transform complete
qadd16 r10, r3, r9 ; a1 [0+12 | 3+15]
qadd16 r11, r5, r7 ; b1 [4+8 | 7+11]
qsub16 r12, r5, r7 ; c1 [4-8 | 7-11]
qsub16 lr, r3, r9 ; d1 [0-12 | 3-15]
qadd16 r3, r10, r11 ; a2 [a1+b1] [0 | 3]
qadd16 r5, r12, lr ; b2 [c1+d1] [4 | 7]
qsub16 r7, r10, r11 ; c2 [a1-b1] [8 |11]
qsub16 r9, lr, r12 ; d2 [d1-c1] [12|15]
qadd16 r10, r2, r8 ; a1 [1+13 | 2+14]
qadd16 r11, r4, r6 ; b1 [5+9 | 6+10]
qsub16 r12, r4, r6 ; c1 [5-9 | 6-10]
qsub16 lr, r2, r8 ; d1 [1-13 | 2-14]
qadd16 r2, r10, r11 ; a2 [a1+b1] [1 | 2]
qadd16 r4, r12, lr ; b2 [c1+d1] [5 | 6]
qsub16 r6, r10, r11 ; c2 [a1-b1] [9 |10]
qsub16 r8, lr, r12 ; d2 [d1-c1] [13|14]
; [a-d]2 += ([a-d]2 > 0)
asrs r10, r3, #16
addpl r10, r10, #1 ; [~0]
asrs r11, r2, #16
addpl r11, r11, #1 ; [~1]
lsl r11, r11, #15 ; [1 | x]
pkhtb r10, r11, r10, asr #1; [1 | 0]
str r10, [r1], #4
lsls r11, r2, #16
addpl r11, r11, #0x10000 ; [~2]
lsls r12, r3, #16
addpl r12, r12, #0x10000 ; [~3]
asr r12, r12, #1 ; [3 | x]
pkhtb r11, r12, r11, asr #17; [3 | 2]
str r11, [r1], #4
asrs r2, r5, #16
addpl r2, r2, #1 ; [~4]
asrs r3, r4, #16
addpl r3, r3, #1 ; [~5]
lsl r3, r3, #15 ; [5 | x]
pkhtb r2, r3, r2, asr #1 ; [5 | 4]
str r2, [r1], #4
lsls r2, r4, #16
addpl r2, r2, #0x10000 ; [~6]
lsls r3, r5, #16
addpl r3, r3, #0x10000 ; [~7]
asr r3, r3, #1 ; [7 | x]
pkhtb r2, r3, r2, asr #17 ; [7 | 6]
str r2, [r1], #4
asrs r2, r7, #16
addpl r2, r2, #1 ; [~8]
asrs r3, r6, #16
addpl r3, r3, #1 ; [~9]
lsl r3, r3, #15 ; [9 | x]
pkhtb r2, r3, r2, asr #1 ; [9 | 8]
str r2, [r1], #4
lsls r2, r6, #16
addpl r2, r2, #0x10000 ; [~10]
lsls r3, r7, #16
addpl r3, r3, #0x10000 ; [~11]
asr r3, r3, #1 ; [11 | x]
pkhtb r2, r3, r2, asr #17 ; [11 | 10]
str r2, [r1], #4
asrs r2, r9, #16
addpl r2, r2, #1 ; [~12]
asrs r3, r8, #16
addpl r3, r3, #1 ; [~13]
lsl r3, r3, #15 ; [13 | x]
pkhtb r2, r3, r2, asr #1 ; [13 | 12]
str r2, [r1], #4
lsls r2, r8, #16
addpl r2, r2, #0x10000 ; [~14]
lsls r3, r9, #16
addpl r3, r3, #0x10000 ; [~15]
asr r3, r3, #1 ; [15 | x]
pkhtb r2, r3, r2, asr #17 ; [15 | 14]
str r2, [r1]
ldrd r4, r5, [r0], r2
ldr lr, c00040004
ldrd r6, r7, [r0], r2
; 0-3
qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
ldrd r8, r9, [r0], r2
; 4-7
qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
ldrd r10, r11, [r0]
; 8-11
qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
; 12-15
qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
lsls r2, r3, #16
smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
addne r11, r11, #1 ; A0 += (a1!=0)
lsls r2, r7, #16
smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
addne r12, r12, #1 ; C0 += (a1!=0)
add r0, r11, r12 ; a1_0 = A0 + C0
sub r11, r11, r12 ; b1_0 = A0 - C0
lsls r2, r5, #16
smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
addne r12, r12, #1 ; B0 += (a1!=0)
lsls r2, r9, #16
smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
addne r2, r2, #1 ; D0 += (a1!=0)
add lr, r12, r2 ; d1_0 = B0 + D0
sub r12, r12, r2 ; c1_0 = B0 - D0
; op[0,4,8,12]
adds r2, r0, lr ; a2 = a1_0 + d1_0
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r0, r0, lr ; d2 = a1_0 - d1_0
mov r2, r2, asr #3 ; >> 3
strh r2, [r1] ; op[0]
addmi r0, r0, #1 ; += a2 < 0
add r0, r0, #3 ; += 3
ldr lr, c00040004
mov r0, r0, asr #3 ; >> 3
strh r0, [r1, #24] ; op[12]
adds r2, r11, r12 ; b2 = b1_0 + c1_0
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r0, r11, r12 ; c2 = b1_0 - c1_0
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #8] ; op[4]
addmi r0, r0, #1 ; += a2 < 0
add r0, r0, #3 ; += 3
smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
mov r0, r0, asr #3 ; >> 3
strh r0, [r1, #16] ; op[8]
; op[3,7,11,15]
add r0, r3, r7 ; a1_3 = A3 + C3
sub r3, r3, r7 ; b1_3 = A3 - C3
smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
add r7, r5, r9 ; d1_3 = B3 + D3
sub r5, r5, r9 ; c1_3 = B3 - D3
adds r2, r0, r7 ; a2 = a1_3 + d1_3
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r3, r5 ; b2 = b1_3 + c1_3
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #6] ; op[3]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
subs r2, r3, r5 ; c2 = b1_3 - c1_3
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #14] ; op[7]
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r9, r0, r7 ; d2 = a1_3 - d1_3
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #22] ; op[11]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #30] ; op[15]
; op[1,5,9,13]
add r0, r3, r5 ; a1_1 = A1 + C1
sub r3, r3, r5 ; b1_1 = A1 - C1
smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
add r5, r7, r9 ; d1_1 = B1 + D1
sub r7, r7, r9 ; c1_1 = B1 - D1
adds r2, r0, r5 ; a2 = a1_1 + d1_1
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r3, r7 ; b2 = b1_1 + c1_1
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #2] ; op[1]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
subs r2, r3, r7 ; c2 = b1_1 - c1_1
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #10] ; op[5]
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r9, r0, r5 ; d2 = a1_1 - d1_1
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #18] ; op[9]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #26] ; op[13]
; op[2,6,10,14]
add r11, r4, r8 ; a1_2 = A2 + C2
sub r12, r4, r8 ; b1_2 = A2 - C2
smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
add r4, r6, r10 ; d1_2 = B2 + D2
sub r8, r6, r10 ; c1_2 = B2 - D2
adds r2, r11, r4 ; a2 = a1_2 + d1_2
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r12, r8 ; b2 = b1_2 + c1_2
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #4] ; op[2]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
subs r2, r12, r8 ; c2 = b1_2 - c1_2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #12] ; op[6]
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r9, r11, r4 ; d2 = a1_2 - d1_2
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #20] ; op[10]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #28] ; op[14]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_walsh4x4_armv6|
c00040004
DCD 0x00040004
END
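Read from its comments, the new ARMv6 routine computes a 4x4 Walsh-Hadamard transform: rows first with a <<2 scale and an (a1 != 0) correction on the first term of each row, then columns with (x + (x < 0) + 3) >> 3 rounding. The following C model is reconstructed from those comments as a readable reference for the register-level code above, not a copy of the actual vp8_short_walsh4x4_c:

/* C model of the transform described by the comments in the ARMv6 code.
 * Variable names mirror those comments; pitch is in bytes, input is short. */
static void walsh4x4_model(short *input, short *output, int pitch)
{
    int pass1[16];              /* row-transformed values, full precision */
    const short *ip = input;
    int i;

    /* Pass 1: one row at a time, scaled by 4, with the (a1 != 0) fixup. */
    for (i = 0; i < 4; i++) {
        int a1 = ip[0] + ip[2];
        int d1 = ip[1] + ip[3];
        int c1 = ip[1] - ip[3];
        int b1 = ip[0] - ip[2];

        pass1[i * 4 + 0] = ((a1 + d1) << 2) + (a1 != 0);   /* A0 += (a1!=0) */
        pass1[i * 4 + 1] = (b1 + c1) << 2;
        pass1[i * 4 + 2] = (b1 - c1) << 2;
        pass1[i * 4 + 3] = (a1 - d1) << 2;

        ip += pitch / 2;
    }

    /* Pass 2: one column at a time, then round with (x + (x<0) + 3) >> 3. */
    for (i = 0; i < 4; i++) {
        int A = pass1[0 * 4 + i], B = pass1[1 * 4 + i];
        int C = pass1[2 * 4 + i], D = pass1[3 * 4 + i];

        int a1 = A + C, b1 = A - C;
        int d1 = B + D, c1 = B - D;

        int a2 = a1 + d1, b2 = b1 + c1;
        int c2 = b1 - c1, d2 = a1 - d1;

        a2 += (a2 < 0); b2 += (b2 < 0);
        c2 += (c2 < 0); d2 += (d2 < 0);

        output[i + 0]  = (short)((a2 + 3) >> 3);
        output[i + 4]  = (short)((b2 + 3) >> 3);
        output[i + 8]  = (short)((c2 + 3) >> 3);
        output[i + 12] = (short)((d2 + 3) >> 3);
    }
}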
@@ -13,12 +13,10 @@
#if HAVE_ARMV6
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
{
vp8_fast_fdct4x4_armv6(input, output, pitch);
vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
vp8_short_fdct4x4_armv6(input, output, pitch);
vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
}
#endif /* HAVE_ARMV6 */
@@ -14,18 +14,24 @@
#if HAVE_ARMV6
extern prototype_fdct(vp8_short_walsh4x4_armv6);
extern prototype_fdct(vp8_fast_fdct4x4_armv6);
extern prototype_fdct(vp8_fast_fdct8x4_armv6);
extern prototype_fdct(vp8_short_fdct4x4_armv6);
extern prototype_fdct(vp8_short_fdct8x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6
#endif
#endif /* HAVE_ARMV6 */
@@ -45,10 +51,10 @@ extern prototype_fdct(vp8_short_walsh4x4_neon);
#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
......
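These #defines are the static counterpart of the rtcd assignments earlier in the diff: with CONFIG_RUNTIME_CPU_DETECT disabled, the generic vp8_fdct_* names are remapped at compile time, so fast4x4/fast8x4 again resolve to the short FDCT. A hedged sketch of how a call site consumes them (the FDCT_INVOKE shape is recalled from the encoder's dct.h and may differ in detail):

/* Sketch: how the remapped names are reached from encoder code. */
#if CONFIG_RUNTIME_CPU_DETECT
#define FDCT_INVOKE(ctx, fn) (ctx)->fn        /* go through the rtcd table       */
#else
#define FDCT_INVOKE(ctx, fn) vp8_fdct_##fn    /* resolve via the #defines above  */
#endif

/* With runtime detection off, a call such as
 *     FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4)(src_diff, coeff, pitch);
 * now compiles to a direct call to vp8_short_fdct4x4_armv6(). */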
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_fdct4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input is *src_diff. src_diff is calculated as
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions),
;where *src_ptr and *pred_ptr are both unsigned char.
;Therefore, *src_diff should be in the range [-255, 255].
;CAUTION:
;The input values of the 25th block are set in vp8_build_dcblock and fall outside [-255, 255].
;However, the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c,
;so it is safe to assume *input is in [-255, 255] in vp8_fast_fdct4x4_c, but not in vp8_short_fdct4x4_c.
|vp8_fast_fdct4x4_neon| PROC
vld1.16 {d2}, [r0], r2 ;load input
ldr r12, _ffdct_coeff_
vld1.16 {d3}, [r0], r2
vld1.16 {d4}, [r0], r2
vld1.16 {d0}, [r12]
vld1.16 {d5}, [r0], r2