Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
c1629ca5
Commit
c1629ca5
authored
Nov 21, 2015
by
Yaowu Xu
Browse files
Merge branch 'master' into nextgenv2
parents
6e9ed38d
16eba81f
Changes
9
Hide whitespace changes
Inline
Side-by-side
vp10/encoder/rdopt.c
View file @
c1629ca5
...
...
@@ -4440,9 +4440,9 @@ static void rd_variance_adjustment(VP10_COMP *cpi,
?
(
source_variance
-
recon_variance
)
:
(
recon_variance
-
source_variance
);
var_error
=
(
200
*
source_variance
*
recon_variance
)
/
((
source_variance
*
source_variance
)
+
(
recon_variance
*
recon_variance
));
var_error
=
(
(
int64_t
)
200
*
source_variance
*
recon_variance
)
/
((
(
int64_t
)
source_variance
*
source_variance
)
+
(
(
int64_t
)
recon_variance
*
recon_variance
));
var_error
=
100
-
var_error
;
}
...
...
vp9/encoder/vp9_context_tree.h
View file @
c1629ca5
...
...
@@ -60,6 +60,7 @@ typedef struct {
#if CONFIG_VP9_TEMPORAL_DENOISING
unsigned
int
newmv_sse
;
unsigned
int
zeromv_sse
;
unsigned
int
zeromv_lastref_sse
;
PREDICTION_MODE
best_sse_inter_mode
;
int_mv
best_sse_mv
;
MV_REFERENCE_FRAME
best_reference_frame
;
...
...
vp9/encoder/vp9_denoiser.c
View file @
c1629ca5
...
...
@@ -224,14 +224,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
}
else
{
// Otherwise, use the zero reference frame.
frame
=
ctx
->
best_zeromv_reference_frame
;
mbmi
->
ref_frame
[
0
]
=
ctx
->
best_zeromv_reference_frame
;
ctx
->
newmv_sse
=
ctx
->
zeromv_sse
;
// Bias to last reference.
if
(
frame
!=
LAST_FRAME
&&
((
ctx
->
zeromv_lastref_sse
<
(
5
*
ctx
->
zeromv_sse
)
>>
2
)
||
denoiser
->
denoising_level
>=
kDenHigh
))
{
frame
=
LAST_FRAME
;
ctx
->
newmv_sse
=
ctx
->
zeromv_lastref_sse
;
}
mbmi
->
ref_frame
[
0
]
=
frame
;
mbmi
->
mode
=
ZEROMV
;
mbmi
->
mv
[
0
].
as_int
=
0
;
ctx
->
best_sse_inter_mode
=
ZEROMV
;
ctx
->
best_sse_mv
.
as_int
=
0
;
ctx
->
newmv_sse
=
ctx
->
zeromv_sse
;
}
if
(
ctx
->
newmv_sse
>
sse_thresh
(
bs
,
increase_denoising
))
{
...
...
@@ -462,6 +467,8 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
if
(
mbmi
->
mv
[
0
].
as_int
==
0
&&
sse
<
ctx
->
zeromv_sse
)
{
ctx
->
zeromv_sse
=
sse
;
ctx
->
best_zeromv_reference_frame
=
mbmi
->
ref_frame
[
0
];
if
(
mbmi
->
ref_frame
[
0
]
==
LAST_FRAME
)
ctx
->
zeromv_lastref_sse
=
sse
;
}
if
(
mbmi
->
mv
[
0
].
as_int
!=
0
&&
sse
<
ctx
->
newmv_sse
)
{
...
...
vp9/encoder/vp9_ratectrl.c
View file @
c1629ca5
...
...
@@ -1558,7 +1558,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
VP9_COMMON
*
const
cm
=
&
cpi
->
common
;
RATE_CONTROL
*
const
rc
=
&
cpi
->
rc
;
int
target
=
rc
->
avg_frame_bandwidth
;
const
int
layer
=
LAYER_IDS_TO_IDX
(
cpi
->
svc
.
spatial_layer_id
,
int
layer
=
LAYER_IDS_TO_IDX
(
cpi
->
svc
.
spatial_layer_id
,
cpi
->
svc
.
temporal_layer_id
,
cpi
->
svc
.
number_temporal_layers
);
if
((
cm
->
current_video_frame
==
0
)
||
...
...
@@ -1573,8 +1573,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
cpi
->
ref_frame_flags
&=
(
~
VP9_LAST_FLAG
&
~
VP9_GOLD_FLAG
&
~
VP9_ALT_FLAG
);
}
else
if
(
is_one_pass_cbr_svc
(
cpi
))
{
cpi
->
svc
.
layer_context
[
layer
].
is_key_frame
=
1
;
reset_temporal_layer_to_zero
(
cpi
);
layer
=
LAYER_IDS_TO_IDX
(
cpi
->
svc
.
spatial_layer_id
,
cpi
->
svc
.
temporal_layer_id
,
cpi
->
svc
.
number_temporal_layers
);
cpi
->
svc
.
layer_context
[
layer
].
is_key_frame
=
1
;
cpi
->
ref_frame_flags
&=
(
~
VP9_LAST_FLAG
&
~
VP9_GOLD_FLAG
&
~
VP9_ALT_FLAG
);
// Assumption here is that LAST_FRAME is being updated for a keyframe.
...
...
vp9/encoder/vp9_rdopt.c
View file @
c1629ca5
...
...
@@ -2831,9 +2831,9 @@ static void rd_variance_adjustment(VP9_COMP *cpi,
?
(
source_variance
-
recon_variance
)
:
(
recon_variance
-
source_variance
);
var_error
=
(
200
*
source_variance
*
recon_variance
)
/
((
source_variance
*
source_variance
)
+
(
recon_variance
*
recon_variance
));
var_error
=
(
(
int64_t
)
200
*
source_variance
*
recon_variance
)
/
((
(
int64_t
)
source_variance
*
source_variance
)
+
(
(
int64_t
)
recon_variance
*
recon_variance
));
var_error
=
100
-
var_error
;
}
...
...
vpx_dsp/bitreader_buffer.c
View file @
c1629ca5
...
...
@@ -44,6 +44,6 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
/*
 * Reads a signed literal that is stored in the bitstream as a (bits + 1)-bit
 * field and sign-extends it to an int.
 *
 * The raw field is fetched with vpx_rb_read_literal(), moved to the top of an
 * unsigned word, and then shifted back down as an int so that the top bit of
 * the field becomes the sign of the result.
 *
 * rb:   bit buffer, passed straight through to vpx_rb_read_literal().
 * bits: number of value bits; bits + 1 bits are requested from the reader.
 *
 * Returns the sign-extended value.
 *
 * NOTE(review): the final right shift of a negative int is
 * implementation-defined in C; this relies on it being an arithmetic shift.
 */
int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
                                   int bits) {
  /* Distance from the top of the (bits + 1)-bit field to the top of the word. */
  const int nbits = sizeof(unsigned) * 8 - bits - 1;
  /*
   * Cast before shifting: the left shift must be done on an unsigned operand
   * so that moving a set bit into the top position is well defined.
   */
  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
  return ((int)value) >> nbits;
}
vpx_dsp/vpx_dsp_rtcd_defs.pl
View file @
c1629ca5
...
...
@@ -893,7 +893,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/
;
add_proto
qw/void vpx_idct32x32_34_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride
";
specialize
qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/
;
specialize
qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/
,
"
$ssse3_x86_64_x86inc
"
;
# Need to add 34 eob idct32x32 neon implementation.
$vpx_idct32x32_34_add_neon_asm
=
vpx_idct32x32_1024_add_neon
;
...
...
vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
View file @
c1629ca5
...
...
@@ -17,18 +17,46 @@
SECTION
_RODATA
; Broadcast word constants. Each pw_<c>x2 entry holds 2*c (or -2*c for the
; pw_m<c>x2 names) replicated across eight 16-bit lanes; they are used as the
; memory operand of pmulhrsw in the 32x32_34 transform below.
pw_11585x2:  times 8 dw 23170

pw_m2404x2:  times 8 dw -2404*2
pw_m4756x2:  times 8 dw -4756*2
pw_m5520x2:  times 8 dw -5520*2

pw_16364x2:  times 8 dw 16364*2
pw_16305x2:  times 8 dw 16305*2
pw_16207x2:  times 8 dw 16207*2
pw_16069x2:  times 8 dw 16069*2
pw_15893x2:  times 8 dw 15893*2
pw_15679x2:  times 8 dw 15679*2
pw_15426x2:  times 8 dw 15426*2

pw__3981x2:  times 8 dw 3981*2
pw__3196x2:  times 8 dw 3196*2
pw__1606x2:  times 8 dw 1606*2
pw___804x2:  times 8 dw 804*2

; Four dwords of 8192; loaded into m8 by idct32x32_34_add and handed to the
; BUTTERFLY_4X / BUTTERFLY_4Xmm macros as their "round" argument.
pd_8192:     times 4 dd 8192
; Eight words of 32; added before the ">> 6" in RECON_AND_STORE.
pw_32:       times 8 dw 32
pw_16:       times 8 dw 16
; TRANSFORM_COEFFS c1, c2
; Emits three 8-word tables of interleaved coefficient pairs:
;   pw_<c1>_<c2>    :  c1,  c2 repeated four times
;   pw_m<c2>_<c1>   : -c2,  c1 repeated four times
;   pw_m<c1>_m<c2>  : -c1, -c2 repeated four times
; The last two are the tables BUTTERFLY_4Xmm references as [pw_m%4_%3] and
; [pw_m%3_m%4].
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro
; Coefficient-pair tables instantiated for the butterflies in this file.
TRANSFORM_COEFFS    6270, 15137
TRANSFORM_COEFFS    3196, 16069
TRANSFORM_COEFFS   13623,  9102

; constants for 32x32_34
TRANSFORM_COEFFS     804, 16364
TRANSFORM_COEFFS   15426,  5520
TRANSFORM_COEFFS    3981, 15893
TRANSFORM_COEFFS   16207,  2404
TRANSFORM_COEFFS    1606, 16305
TRANSFORM_COEFFS   15679,  4756
; Equal pair: used by the 11585/11585 BUTTERFLY_4X calls in IDCT32X32_34x.
TRANSFORM_COEFFS   11585, 11585
; PAIR_PP_COEFFS c1, c2
; Emits one 8-word table dpw_<c1>_<c2> whose low four words are c1 and whose
; high four words are c2.
%macro PAIR_PP_COEFFS 2
dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
%endmacro
...
...
@@ -80,6 +108,15 @@ SECTION .text
packssdw
m
%
2
,
m
%
6
%endmacro
; BUTTERFLY_4Xmm dst1, dst2, coef1, coef2, round, tmp1, tmp2
; Butterfly on m<dst1>/m<dst2> that multiplies against the negated tables
; [pw_m<coef2>_<coef1>] and [pw_m<coef1>_m<coef2>] (both emitted by
; TRANSFORM_COEFFS coef1, coef2).
;   dst1, dst2 : register numbers, read as inputs and overwritten with results
;   coef1/2    : coefficient values, used only to form the table names
;   round      : operand forwarded unchanged to MUL_ADD_2X
;   tmp1, tmp2 : scratch register numbers, clobbered
; MUL_ADD_2X is defined outside this view; presumably it multiply-accumulates
; the interleaved words against both tables, adds the rounding operand and
; shifts -- TODO confirm against its definition.
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
  ; High halves: interleave words of dst2/dst1 into tmp1 and transform.
  punpckhwd          m%6, m%2, m%1
  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4]
  ; Low halves: interleave in place in dst2 and transform.
  punpcklwd          m%2, m%1
  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4]
  ; Repack the dword results to words with signed saturation.
  packssdw           m%1, m%7
  packssdw           m%2, m%6
%endmacro
; matrix transpose
%macro INTERLEAVE_2X 4
punpckh
%
1
m
%
4
,
m
%
2
,
m
%
3
...
...
@@ -298,4 +335,453 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
RET
; Byte offset of intermediate row N inside its 8-row group of 16-byte vectors:
; idxN == 16 * (N mod 8). The base of each group (rows 0-7, 8-15, 16-23, 24-31)
; is supplied separately, as arguments %1..%4 of IDCT32X32_34x.
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7
; IDCT32X32_34x off0, off8, off16, off24
; One 1-D pass of the reduced ("34") 32-point inverse DCT on eight columns.
;
; Arguments are byte offsets added to stp; results are written to
;   [stp + %1 + idx0..idx7]    outputs  0-7
;   [stp + %2 + idx8..idx15]   outputs  8-15
;   [stp + %3 + idx16..idx23]  outputs 16-23
;   [stp + %4 + idx24..idx31]  outputs 24-31
; The same slots are also used as spill space while the macro runs.
;
; On entry:
;   m0-m7 hold the eight input rows. The odd ones (m1, m3, m5, m7) are consumed
;         directly by blocks A and B; the even ones (m0, m2, m4, m6) are first
;         spilled to [r4 + 16*0/2/4/6] and reloaded later through
;         [rsp + transposed_in + 16*n], so r4 must equal rsp + transposed_in.
;   m8    is passed as the rounding operand to BUTTERFLY_4X / BUTTERFLY_4Xmm.
;   m9, m10 are used as scratch by SUM_SUB and the butterfly macros.
; All of m0-m15 are clobbered.
; Only input rows 0-7 are read; presumably the caller guarantees the remaining
; rows are zero -- TODO confirm against the C dispatch for the _34 variant.
%macro IDCT32X32_34x 4
  ; FROM idct32x32_add_neon.asm
  ;
  ; Instead of doing the transforms stage by stage, it is done by loading
  ; some input values and doing as many stages as possible to minimize the
  ; storing/loading of intermediate results. To fit within registers, the
  ; final coefficients are cut into four blocks:
  ; BLOCK A: 16-19,28-31
  ; BLOCK B: 20-23,24-27
  ; BLOCK C: 8-11,12-15
  ; BLOCK D: 0-3,4-7
  ; Blocks A and C are straight calculation through the various stages. In
  ; block B, further calculations are performed using the results from
  ; block A. In block D, further calculations are performed using the results
  ; from block C and then the final calculations are done using results from
  ; block A and B which have been combined at the end of block B.
  ;

  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; The even input rows are spilled here, interleaved with the multiplies.
  mova                 m11, m1
  pmulhrsw             m1,  [pw___804x2] ; stp1_16
  mova      [r4 + 0],       m0
  pmulhrsw             m11, [pw_16364x2] ; stp2_31
  mova      [r4 + 16 * 2],  m2
  mova                 m12, m7
  pmulhrsw             m7,  [pw_15426x2] ; stp1_28
  mova      [r4 + 16 * 4],  m4
  pmulhrsw             m12, [pw_m5520x2] ; stp2_19
  mova      [r4 + 16 * 6],  m6

  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova                  m2, m1  ; stp1_16
  mova                  m0, m11 ; stp1_31
  mova                  m4, m7  ; stp1_28
  mova                 m15, m12 ; stp1_19

  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18

  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18
  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29

  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28

  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Block A results 28-31 are spilled here to free registers for block B.
  mova                 m6, m5
  pmulhrsw             m5, [pw__3981x2] ; stp1_20
  mova [stp + %4 + idx28], m12
  mova [stp + %4 + idx29], m15
  pmulhrsw             m6, [pw_15893x2] ; stp2_27
  mova [stp + %4 + idx30], m2
  mova                 m2, m3
  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
  mova [stp + %4 + idx31], m11
  pmulhrsw             m2, [pw_16207x2] ; stp2_24

  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova                 m13, m5 ; stp1_20
  mova                 m14, m6 ; stp1_27
  mova                 m15, m3 ; stp1_23
  mova                 m11, m2 ; stp1_24

  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22

  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21
  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26

  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21

  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23
  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22
  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21
  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20
  mova [stp + %3 + idx16], m1
  mova [stp + %3 + idx17], m0
  mova [stp + %3 + idx18], m4
  mova [stp + %3 + idx19], m7

  ; Reload the block A values spilled in stage 1 and combine them with block B.
  mova                 m4, [stp + %4 + idx28]
  mova                 m7, [stp + %4 + idx29]
  mova                m10, [stp + %4 + idx30]
  mova                m12, [stp + %4 + idx31]
  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25
  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
  mova [stp + %4 + idx28], m4
  mova [stp + %4 + idx29], m7
  mova [stp + %4 + idx30], m10
  mova [stp + %4 + idx31], m12

  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; The %if 0 branch is the disabled SUM_SUB + pmulhrsw form; the %else branch
  ; (BUTTERFLY_4X with 11585/11585, then SWAP) is what gets assembled.
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova                m10, [pw_11585x2]
  SUM_SUB               6,  5, 9
  pmulhrsw             m6, m10 ; stp1_27
  pmulhrsw             m5, m10 ; stp1_20
  SUM_SUB              13, 14, 9
  pmulhrsw            m13, m10 ; stp1_26
  pmulhrsw            m14, m10 ; stp1_21
  SUM_SUB              11, 15, 9
  pmulhrsw            m11, m10 ; stp1_25
  pmulhrsw            m15, m10 ; stp1_22
  SUM_SUB               2,  3, 9
  pmulhrsw             m2, m10 ; stp1_24
  pmulhrsw             m3, m10 ; stp1_23
%else
  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
  SWAP 6, 5
  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
  SWAP 13, 14
  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
  SWAP 11, 15
  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
  SWAP 2, 3
%endif

  mova [stp + %4 + idx24], m2
  mova [stp + %4 + idx25], m11
  mova [stp + %4 + idx26], m13
  mova [stp + %4 + idx27], m6

  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Input rows 2 and 6 come back from the spill area; block B results 20-23
  ; are stored in between the multiplies.
  mova                 m0, [rsp + transposed_in + 16 * 2]
  mova                 m6, [rsp + transposed_in + 16 * 6]

  mova                 m1, m0
  pmulhrsw             m0, [pw__1606x2] ; stp1_8
  mova [stp + %3 + idx20], m5
  mova [stp + %3 + idx21], m14
  pmulhrsw             m1, [pw_16305x2] ; stp2_15
  mova [stp + %3 + idx22], m15
  mova                 m7, m6
  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
  mova [stp + %3 + idx23], m3
  pmulhrsw             m6, [pw_15679x2] ; stp1_12

  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova                 m3, m0 ; stp1_8
  mova                 m2, m1 ; stp1_15

  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ; stp1_9, stp1_14
  mova                 m4, m7 ; stp1_11
  mova                 m5, m6 ; stp1_12
  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10

  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               0,  7, 9 ; stp1_8, stp1_11
  SUM_SUB               2,  4, 9 ; stp1_9, stp1_10
  SUM_SUB               1,  6, 9 ; stp1_15, stp1_12
  SUM_SUB               3,  5, 9 ; stp1_14, stp1_13

  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova                m10, [pw_11585x2]
  SUM_SUB               5,  4, 9
  pmulhrsw             m5, m10 ; stp1_13
  pmulhrsw             m4, m10 ; stp1_10
  SUM_SUB               6,  7, 9
  pmulhrsw             m6, m10 ; stp1_12
  pmulhrsw             m7, m10 ; stp1_11
%else
  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
  SWAP 5, 4
  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
  SWAP 6, 7
%endif

  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova [stp + %2 + idx8],  m0
  mova [stp + %2 + idx9],  m2
  mova [stp + %2 + idx10], m4
  mova [stp + %2 + idx11], m7

  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova                m11, [rsp + transposed_in + 16 * 4]
  mova                m12, m11
  pmulhrsw            m11, [pw__3196x2] ; stp1_4
  pmulhrsw            m12, [pw_16069x2] ; stp1_7

  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova                 m0, [rsp + transposed_in + 16 * 0]
  mova                m10, [pw_11585x2]

  mova                 m7, m0
  pmulhrsw             m0, m10 ; stp1_1
  pmulhrsw             m7, m10 ; stp1_0

  mova                m14, m11 ; stp1_4
  mova                m13, m12 ; stp1_7

  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  SUM_SUB              13, 14, 9
  pmulhrsw            m13, m10 ; stp1_6
  pmulhrsw            m14, m10 ; stp1_5
%else
  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
  SWAP 13, 14
%endif
  mova                 m4, m0 ; stp1_1
  mova                 m2, m7 ; stp1_0

  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               0, 12, 9 ; stp1_0, stp1_7
  SUM_SUB               7, 13, 9 ; stp1_1, stp1_6
  SUM_SUB               2, 14, 9 ; stp1_2, stp1_5
  SUM_SUB               4, 11, 9 ; stp1_3, stp1_4

  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB               0,  1, 9 ; stp1_0, stp1_15
  SUM_SUB               7,  3, 9 ; stp1_1, stp1_14
  SUM_SUB               2,  5, 9 ; stp1_2, stp1_13
  SUM_SUB               4,  6, 9 ; stp1_3, stp1_12

  ; 0-3, 28-31 final stage
  mova                m15, [stp + %4 + idx30]
  mova                m10, [stp + %4 + idx31]
  SUM_SUB               0, 10, 9 ; stp1_0, stp1_31
  SUM_SUB               7, 15, 9 ; stp1_1, stp1_30
  mova [stp + %1 + idx0],  m0
  mova [stp + %1 + idx1],  m7
  mova [stp + %4 + idx30], m15
  mova [stp + %4 + idx31], m10
  mova                 m7, [stp + %4 + idx28]
  mova                 m0, [stp + %4 + idx29]
  SUM_SUB               2,  0, 9 ; stp1_2, stp1_29
  SUM_SUB               4,  7, 9 ; stp1_3, stp1_28
  mova [stp + %1 + idx2],  m2
  mova [stp + %1 + idx3],  m4
  mova [stp + %4 + idx28], m7
  mova [stp + %4 + idx29], m0

  ; 12-15, 16-19 final stage
  mova                 m0, [stp + %3 + idx16]
  mova                 m7, [stp + %3 + idx17]
  mova                 m2, [stp + %3 + idx18]
  mova                 m4, [stp + %3 + idx19]
  SUM_SUB               1,  0, 9 ; stp1_15, stp1_16
  SUM_SUB               3,  7, 9 ; stp1_14, stp1_17
  SUM_SUB               5,  2, 9 ; stp1_13, stp1_18
  SUM_SUB               6,  4, 9 ; stp1_12, stp1_19
  mova [stp + %2 + idx12], m6
  mova [stp + %2 + idx13], m5
  mova [stp + %2 + idx14], m3
  mova [stp + %2 + idx15], m1
  mova [stp + %3 + idx16], m0
  mova [stp + %3 + idx17], m7
  mova [stp + %3 + idx18], m2
  mova [stp + %3 + idx19], m4

  ; Reload block C results 8-11 and combine with block D values 4-7, which are
  ; still live in m11-m14.
  mova                 m4, [stp + %2 + idx8]
  mova                 m5, [stp + %2 + idx9]
  mova                 m6, [stp + %2 + idx10]
  mova                 m7, [stp + %2 + idx11]
  SUM_SUB              11,  7, 9 ; stp1_4, stp1_11
  SUM_SUB              14,  6, 9 ; stp1_5, stp1_10
  SUM_SUB              13,  5, 9 ; stp1_6, stp1_9
  SUM_SUB              12,  4, 9 ; stp1_7, stp1_8

  ; 4-7, 24-27 final stage
  mova                 m0, [stp + %4 + idx27]
  mova                 m1, [stp + %4 + idx26]
  mova                 m2, [stp + %4 + idx25]
  mova                 m3, [stp + %4 + idx24]
  SUM_SUB              11,  0, 9 ; stp1_4, stp1_27
  SUM_SUB              14,  1, 9 ; stp1_5, stp1_26
  SUM_SUB              13,  2, 9 ; stp1_6, stp1_25
  SUM_SUB              12,  3, 9 ; stp1_7, stp1_24
  mova [stp + %4 + idx27], m0
  mova [stp + %4 + idx26], m1
  mova [stp + %4 + idx25], m2
  mova [stp + %4 + idx24], m3
  mova [stp + %1 + idx4],  m11
  mova [stp + %1 + idx5],  m14
  mova [stp + %1 + idx6],  m13
  mova [stp + %1 + idx7],  m12

  ; 8-11, 20-23 final stage
  mova                 m0, [stp + %3 + idx20]
  mova                 m1, [stp + %3 + idx21]
  mova                 m2, [stp + %3 + idx22]
  mova                 m3, [stp + %3 + idx23]
  SUM_SUB               7,  0, 9 ; stp1_11, stp1_20
  SUM_SUB               6,  1, 9 ; stp1_10, stp1_21
  SUM_SUB               5,  2, 9 ; stp1_9, stp1_22
  SUM_SUB               4,  3, 9 ; stp1_8, stp1_23
  mova [stp + %2 + idx8],  m4
  mova [stp + %2 + idx9],  m5
  mova [stp + %2 + idx10], m6
  mova [stp + %2 + idx11], m7
  mova [stp + %3 + idx20], m0
  mova [stp + %3 + idx21], m1
  mova [stp + %3 + idx22], m2
  mova [stp + %3 + idx23], m3
%endmacro
; RECON_AND_STORE src_offset
; Rounds the transform results held on the stack, adds them to the existing
; pixels at outputq and writes the result back, 32 pixels per row for 32 rows.
;   %1 : byte offset from rsp of the first result vector.
; For each row, four 8-word vectors are read at a spacing of 16*32 bytes
; (one per group of eight columns); stp advances by 16 bytes per row and
; outputq by strideq.
; Clobbers m0-m8, m11, r6, stp and outputq. Note that m8 is zeroed here, so it
; no longer holds the rounding constant afterwards.
; NOTE(review): the destination is accessed with mova; if that assembles to an
; aligned move, outputq must be 16-byte aligned on every row -- confirm.
%macro RECON_AND_STORE 1
  mova            m11, [pw_32]          ; rounding term for the >> 6 below
  lea             stp, [rsp + %1]
  mov              r6, 32               ; row counter
  pxor             m8, m8               ; zero, used to widen bytes to words
%%recon_and_store:
  mova             m0, [stp + 16 * 32 * 0]
  mova             m1, [stp + 16 * 32 * 1]
  mova             m2, [stp + 16 * 32 * 2]
  mova             m3, [stp + 16 * 32 * 3]
  add             stp, 16

  ; (x + 32) >> 6 with an arithmetic shift
  paddw            m0, m11
  paddw            m1, m11
  paddw            m2, m11
  paddw            m3, m11

  psraw            m0, 6
  psraw            m1, 6
  psraw            m2, 6
  psraw            m3, 6

  ; load 4 x 8 destination pixels and zero-extend them to words
  movh             m4, [outputq +  0]
  movh             m5, [outputq +  8]
  movh             m6, [outputq + 16]
  movh             m7, [outputq + 24]

  punpcklbw        m4, m8
  punpcklbw        m5, m8
  punpcklbw        m6, m8
  punpcklbw        m7, m8

  paddw            m0, m4
  paddw            m1, m5
  paddw            m2, m6
  paddw            m3, m7

  ; pack back to bytes with unsigned saturation
  packuswb         m0, m1
  packuswb         m2, m3

  mova [outputq +  0], m0
  mova [outputq + 16], m2
  lea         outputq, [outputq + strideq]
  dec              r6
  jnz %%recon_and_store
%endmacro
; Stack scratch layout for idct32x32_34_add, in bytes relative to rsp.
; i32x32_size is the value passed to cglobal; it covers five regions of
; 16*32 bytes.
%define i32x32_size 16*32*5
; Pass one and pass two share the same base offset (both are 0).
%define pass_two_start 16*32*0
; Spill area for the input rows; r4 is pointed here and IDCT32X32_34x reloads
; rows from [rsp + transposed_in + 16*n].
%define transposed_in 16*32*4
%define pass_one_start 16*32*0
; stp is the base register for the intermediate/result rows.
%define stp r8
INIT_XMM
ss
se3
cglobal
idct32x32_34_add
,
3
,
11
,
16
,
i32x32_size
,
input
,
output
,
stride
mova
m8
,
[
pd_8192
]
lea
stp
,
[
rsp
+
pass_one_start
]
idct32x32_34:
mov
r3
,
inputq
lea
r4
,
[
rsp
+
transposed_in
]
idct32x32_34_transpose:
mova
m0
,
[
r3
+
0
]
mova
m1
,
[
r3
+
16
*
4
]
mova
m2
,
[
r3
+
16
*
8
]
mova
m3
,
[
r3
+
16
*
12
]
mova
m4
,
[
r3
+
16
*
16
]
mova
m5
,
[
r3
+
16
*
20
]
mova
m6
,
[
r3
+
16
*
24
]