Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
d5ae4331
Commit
d5ae4331
authored
Jun 12, 2014
by
Jingning Han
Committed by
Gerrit Code Review
Jun 12, 2014
Browse files
Merge "Fast computation path for forward transform and quantization"
parents
893433be
ccba289f
Changes
11
Hide whitespace changes
Inline
Side-by-side
vp9/common/vp9_rtcd_defs.pl
View file @
d5ae4331
...
...
@@ -744,15 +744,27 @@ specialize qw/vp9_fht16x16 sse2/;
add_proto
qw/void vp9_fwht4x4/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fwht4x4/
,
"
$mmx_x86inc
";
add_proto
qw/void vp9_fdct4x4_1/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct4x4_1 sse2/
;
add_proto
qw/void vp9_fdct4x4/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct4x4 sse2 avx2/
;
add_proto
qw/void vp9_fdct8x8_1/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct8x8_1 sse2/
;
add_proto
qw/void vp9_fdct8x8/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct8x8 sse2 avx2/
,
"
$ssse3_x86_64
";
add_proto
qw/void vp9_fdct16x16_1/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct16x16_1 sse2/
;
add_proto
qw/void vp9_fdct16x16/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct16x16 sse2/
;
add_proto
qw/void vp9_fdct32x32_1/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct32x32_1 sse2/
;
add_proto
qw/void vp9_fdct32x32/
,
"
const int16_t *input, int16_t *output, int stride
";
specialize
qw/vp9_fdct32x32 sse2 avx2/
;
...
...
vp9/encoder/vp9_block.h
View file @
d5ae4331
...
...
@@ -28,6 +28,7 @@ struct macroblock_plane {
struct
buf_2d
src
;
// Quantizer settings
int16_t
*
quant_fp
;
int16_t
*
quant
;
int16_t
*
quant_shift
;
int16_t
*
zbin
;
...
...
@@ -105,6 +106,9 @@ struct macroblock {
int
use_lp32x32fdct
;
int
skip_encode
;
// skip forward transform and quantization
int
skip_txfm
;
// Used to store sub partition's choices.
MV
pred_mv
[
MAX_REF_FRAMES
];
...
...
vp9/encoder/vp9_context_tree.h
View file @
d5ae4331
...
...
@@ -33,6 +33,7 @@ typedef struct {
int
is_coded
;
int
num_4x4_blk
;
int
skip
;
int
skip_txfm
;
int
best_mode_index
;
int
hybrid_pred_diff
;
int
comp_pred_diff
;
...
...
vp9/encoder/vp9_dct.c
View file @
d5ae4331
...
...
@@ -43,6 +43,17 @@ static void fdct4(const int16_t *input, int16_t *output) {
output
[
3
]
=
fdct_round_shift
(
temp2
);
}
// DC-only forward transform for a 4x4 residual block.
//
// Adds up the 16 input samples and writes the scaled total as the DC
// coefficient; no AC coefficients are computed.
//   input  - top-left sample of the 4x4 block
//   output - output[0] receives the DC value, output[1] is cleared
//   stride - distance, in samples, between the starts of consecutive rows
//
// The scale factor is 2 so that this C version agrees with
// vp9_fdct4x4_1_sse2, which is selected for the same RTCD entry point.
// A multiplication is used instead of a left shift because the sum can be
// negative, and left-shifting a negative value is undefined in C.
void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int sum = 0;  // plain int: keeps the running total free of 16-bit wrap

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      sum += input[r * stride + c];
    }
  }

  output[0] = (int16_t)(sum * 2);
  output[1] = 0;
}
void
vp9_fdct4x4_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
...
...
@@ -240,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) {
output
[
7
]
=
fdct_round_shift
(
t3
);
}
// DC-only forward transform for an 8x8 residual block.
//
// Adds up the 64 input samples and writes the total as the DC coefficient;
// no AC coefficients are computed.
//   input  - top-left sample of the 8x8 block
//   output - output[0] receives the DC value, output[1] is cleared
//   stride - distance, in samples, between the starts of consecutive rows
//
// The sum is stored unscaled so that this C version agrees with
// vp9_fdct8x8_1_sse2, which is selected for the same RTCD entry point.
void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int sum = 0;  // plain int: keeps the running total free of 16-bit wrap

  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      sum += input[r * stride + c];
    }
  }

  output[0] = (int16_t)sum;
  output[1] = 0;
}
void
vp9_fdct8x8_c
(
const
int16_t
*
input
,
int16_t
*
final_output
,
int
stride
)
{
int
i
,
j
;
int16_t
intermediate
[
64
];
...
...
@@ -311,6 +333,17 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
}
}
// DC-only forward transform for a 16x16 residual block.
//
// Adds up the 256 input samples and writes half of the total as the DC
// coefficient; no AC coefficients are computed.
//   input  - top-left sample of the 16x16 block
//   output - output[0] receives the DC value, output[1] is cleared
//   stride - distance, in samples, between the starts of consecutive rows
//
// The accumulator is a plain int: 256 samples of 8-bit residual magnitude
// can exceed the int16_t range, and the right shift below must see the
// un-wrapped total.
// NOTE(review): the scale (sum / 2) is chosen to match the DC magnitude of
// the full vp9_fdct16x16 transform; confirm it is bit-exact with the SIMD
// specialization of this entry point.
void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int sum = 0;

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      sum += input[r * stride + c];
    }
  }

  // Right shift of a negative int is implementation-defined in C; this
  // assumes the usual arithmetic shift (rounds toward -infinity).
  output[0] = (int16_t)(sum >> 1);
  output[1] = 0;
}
void
vp9_fdct16x16_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
...
...
@@ -1329,6 +1362,17 @@ static void fdct32(const int *input, int *output, int round) {
output
[
31
]
=
dct_32_round
(
step
[
31
]
*
cospi_31_64
+
step
[
16
]
*
-
cospi_1_64
);
}
// DC-only forward transform for a 32x32 residual block.
//
// Adds up the 1024 input samples and writes one eighth of the total as the
// DC coefficient; no AC coefficients are computed.
//   input  - top-left sample of the 32x32 block
//   output - output[0] receives the DC value, output[1] is cleared
//   stride - distance, in samples, between the starts of consecutive rows
//
// The accumulator is a plain int: 1024 samples easily exceed the int16_t
// range, and the right shift below must see the un-wrapped total.
// NOTE(review): the scale (sum / 8) is chosen to match the DC magnitude of
// the full vp9_fdct32x32 transform; confirm it is bit-exact with the SIMD
// specialization of this entry point.
void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int sum = 0;

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      sum += input[r * stride + c];
    }
  }

  // Right shift of a negative int is implementation-defined in C; this
  // assumes the usual arithmetic shift (rounds toward -infinity).
  output[0] = (int16_t)(sum >> 3);
  output[1] = 0;
}
void
vp9_fdct32x32_c
(
const
int16_t
*
input
,
int16_t
*
out
,
int
stride
)
{
int
i
,
j
;
int
output
[
32
*
32
];
...
...
vp9/encoder/vp9_encodeframe.c
View file @
d5ae4331
...
...
@@ -1370,6 +1370,7 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
}
x
->
skip
=
ctx
->
skip
;
x
->
skip_txfm
=
mbmi
->
segment_id
?
0
:
ctx
->
skip_txfm
;
}
static
void
encode_b_rt
(
VP9_COMP
*
cpi
,
const
TileInfo
*
const
tile
,
...
...
@@ -2613,6 +2614,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
,
&
this_rate
,
&
this_dist
,
bsize
);
ctx
->
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
ctx
->
skip_txfm
=
x
->
skip_txfm
;
if
(
this_rate
!=
INT_MAX
)
{
int
pl
=
partition_plane_context
(
xd
,
mi_row
,
mi_col
,
bsize
);
...
...
@@ -2699,6 +2701,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
&
this_rate
,
&
this_dist
,
subsize
);
pc_tree
->
horizontal
[
0
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
horizontal
[
0
].
skip_txfm
=
x
->
skip_txfm
;
sum_rd
=
RDCOST
(
x
->
rdmult
,
x
->
rddiv
,
sum_rate
,
sum_dist
);
...
...
@@ -2708,6 +2711,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
&
this_rate
,
&
this_dist
,
subsize
);
pc_tree
->
horizontal
[
1
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
horizontal
[
1
].
skip_txfm
=
x
->
skip_txfm
;
if
(
this_rate
==
INT_MAX
)
{
sum_rd
=
INT64_MAX
;
...
...
@@ -2737,12 +2741,14 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
,
&
this_rate
,
&
this_dist
,
subsize
);
pc_tree
->
vertical
[
0
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
vertical
[
0
].
skip_txfm
=
x
->
skip_txfm
;
sum_rd
=
RDCOST
(
x
->
rdmult
,
x
->
rddiv
,
sum_rate
,
sum_dist
);
if
(
sum_rd
<
best_rd
&&
mi_col
+
ms
<
cm
->
mi_cols
)
{
load_pred_mv
(
x
,
ctx
);
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
+
ms
,
&
this_rate
,
&
this_dist
,
subsize
);
pc_tree
->
vertical
[
1
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
vertical
[
1
].
skip_txfm
=
x
->
skip_txfm
;
if
(
this_rate
==
INT_MAX
)
{
sum_rd
=
INT64_MAX
;
}
else
{
...
...
@@ -2831,14 +2837,17 @@ static void nonrd_use_partition(VP9_COMP *cpi,
case
PARTITION_NONE
:
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
,
totrate
,
totdist
,
subsize
);
pc_tree
->
none
.
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
none
.
skip_txfm
=
x
->
skip_txfm
;
break
;
case
PARTITION_VERT
:
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
,
totrate
,
totdist
,
subsize
);
pc_tree
->
vertical
[
0
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
vertical
[
0
].
skip_txfm
=
x
->
skip_txfm
;
if
(
mi_col
+
hbs
<
cm
->
mi_cols
)
{
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
+
hbs
,
&
rate
,
&
dist
,
subsize
);
pc_tree
->
vertical
[
1
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
vertical
[
1
].
skip_txfm
=
x
->
skip_txfm
;
if
(
rate
!=
INT_MAX
&&
dist
!=
INT64_MAX
&&
*
totrate
!=
INT_MAX
&&
*
totdist
!=
INT64_MAX
)
{
*
totrate
+=
rate
;
...
...
@@ -2849,10 +2858,12 @@ static void nonrd_use_partition(VP9_COMP *cpi,
case
PARTITION_HORZ
:
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
,
mi_col
,
totrate
,
totdist
,
subsize
);
pc_tree
->
horizontal
[
0
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
horizontal
[
0
].
skip_txfm
=
x
->
skip_txfm
;
if
(
mi_row
+
hbs
<
cm
->
mi_rows
)
{
nonrd_pick_sb_modes
(
cpi
,
tile
,
mi_row
+
hbs
,
mi_col
,
&
rate
,
&
dist
,
subsize
);
pc_tree
->
horizontal
[
1
].
mic
.
mbmi
=
xd
->
mi
[
0
]
->
mbmi
;
pc_tree
->
horizontal
[
1
].
skip_txfm
=
x
->
skip_txfm
;
if
(
rate
!=
INT_MAX
&&
dist
!=
INT64_MAX
&&
*
totrate
!=
INT_MAX
&&
*
totdist
!=
INT64_MAX
)
{
*
totrate
+=
rate
;
...
...
@@ -3055,6 +3066,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
init_encode_frame_mb_context
(
cpi
);
set_prev_mi
(
cm
);
x
->
skip_txfm
=
0
;
if
(
sf
->
use_nonrd_pick_mode
)
{
// Initialize internal buffer pointers for rtc coding, where non-RD
// mode decision is used and hence no buffer pointer swap needed.
...
...
vp9/encoder/vp9_encodemb.c
View file @
d5ae4331
...
...
@@ -301,6 +301,52 @@ static INLINE void fdct32x32(int rd_transform,
vp9_fdct32x32
(
src
,
dst
,
src_stride
);
}
// Fast-path forward transform and quantization of one transform block.
//
// Locates the block's residual inside the plane's src_diff buffer, runs the
// transform selected by tx_size, and quantizes the DC coefficient only using
// the plane's quant_fp[0] / dequant[0] entries.  For 8x8, 16x16 and 32x32
// the DC-only vp9_fdct*_1 transforms are used; 4x4 goes through
// x->fwd_txm4x4.  32x32 blocks use the dedicated vp9_quantize_dc_32x32.
//   x           - encoder macroblock state
//   plane       - index of the plane being coded
//   block       - block index within the plane (used for buffer offsets)
//   plane_bsize - size of the enclosing block in this plane
//   tx_size     - transform size of the block
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const mbd = &x->e_mbd;
  const struct macroblock_plane *const enc = &x->plane[plane];
  const struct macroblockd_plane *const dec = &mbd->plane[plane];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  int16_t *const coeff = BLOCK_OFFSET(enc->coeff, block);
  int16_t *const qcoeff = BLOCK_OFFSET(enc->qcoeff, block);
  int16_t *const dqcoeff = BLOCK_OFFSET(dec->dqcoeff, block);
  uint16_t *const eob = &enc->eobs[block];
  const int16_t *src_diff;
  int col, row;

  // Convert the block index into a position in 4x4 units, then into a
  // sample offset within the residual buffer.
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &col, &row);
  src_diff = &enc->src_diff[4 * (row * diff_stride + col)];

  switch (tx_size) {
    case TX_32X32:
      vp9_fdct32x32_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc_32x32(coeff, x->skip_block, enc->round,
                            enc->quant_fp[0], qcoeff, dqcoeff,
                            dec->dequant[0], eob);
      break;
    case TX_16X16:
      vp9_fdct16x16_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, enc->round, enc->quant_fp[0],
                      qcoeff, dqcoeff, dec->dequant[0], eob);
      break;
    case TX_8X8:
      vp9_fdct8x8_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, enc->round, enc->quant_fp[0],
                      qcoeff, dqcoeff, dec->dequant[0], eob);
      break;
    case TX_4X4:
      x->fwd_txm4x4(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, enc->round, enc->quant_fp[0],
                      qcoeff, dqcoeff, dec->dequant[0], eob);
      break;
    default:
      assert(0);
  }
}
void
vp9_xform_quant
(
MACROBLOCK
*
x
,
int
plane
,
int
block
,
BLOCK_SIZE
plane_bsize
,
TX_SIZE
tx_size
)
{
MACROBLOCKD
*
const
xd
=
&
x
->
e_mbd
;
...
...
@@ -376,8 +422,19 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
return
;
}
if
(
!
x
->
skip_recode
)
vp9_xform_quant
(
x
,
plane
,
block
,
plane_bsize
,
tx_size
);
if
(
x
->
skip_txfm
==
0
)
{
// full forward transform and quantization
if
(
!
x
->
skip_recode
)
vp9_xform_quant
(
x
,
plane
,
block
,
plane_bsize
,
tx_size
);
}
else
if
(
x
->
skip_txfm
==
2
)
{
// fast path forward transform and quantization
vp9_xform_quant_fp
(
x
,
plane
,
block
,
plane_bsize
,
tx_size
);
}
else
{
// skip forward transform
p
->
eobs
[
block
]
=
0
;
*
a
=
*
l
=
0
;
return
;
}
if
(
x
->
optimize
&&
(
!
x
->
skip_recode
||
!
x
->
skip_optimize
))
{
const
int
ctx
=
combine_entropy_contexts
(
*
a
,
*
l
);
...
...
vp9/encoder/vp9_encodemb.h
View file @
d5ae4331
...
...
@@ -22,7 +22,8 @@ extern "C" {
void
vp9_encode_sb
(
MACROBLOCK
*
x
,
BLOCK_SIZE
bsize
);
void
vp9_encode_sby_pass1
(
MACROBLOCK
*
x
,
BLOCK_SIZE
bsize
);
void
vp9_xform_quant_fp
(
MACROBLOCK
*
x
,
int
plane
,
int
block
,
BLOCK_SIZE
plane_bsize
,
TX_SIZE
tx_size
);
void
vp9_xform_quant
(
MACROBLOCK
*
x
,
int
plane
,
int
block
,
BLOCK_SIZE
plane_bsize
,
TX_SIZE
tx_size
);
...
...
vp9/encoder/vp9_pickmode.c
View file @
d5ae4331
...
...
@@ -156,24 +156,28 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
unsigned
int
sse
;
int
rate
;
int64_t
dist
;
struct
macroblock_plane
*
const
p
=
&
x
->
plane
[
0
];
struct
macroblockd_plane
*
const
pd
=
&
xd
->
plane
[
0
];
const
int
quant
=
pd
->
dequant
[
1
];
unsigned
int
var
=
cpi
->
fn_ptr
[
bsize
].
vf
(
p
->
src
.
buf
,
p
->
src
.
stride
,
pd
->
dst
.
buf
,
pd
->
dst
.
stride
,
&
sse
);
*
var_y
=
var
;
*
sse_y
=
sse
;
if
(
sse
<
pd
->
dequant
[
0
]
*
pd
->
dequant
[
0
]
>>
6
)
x
->
skip_txfm
=
1
;
else
if
(
var
<
quant
*
quant
>>
6
)
x
->
skip_txfm
=
2
;
else
x
->
skip_txfm
=
0
;
// TODO(jingning) This is a temporary solution to account for frames with
// light changes. Need to customize the rate-distortion modeling for non-RD
// mode decision.
if
((
sse
>>
3
)
>
var
)
sse
=
var
;
vp9_model_rd_from_var_lapndz
(
var
+
sse
,
1
<<
num_pels_log2_lookup
[
bsize
],
pd
->
de
quant
[
1
]
>>
3
,
&
rate
,
&
dist
);
quant
>>
3
,
&
rate
,
&
dist
);
*
out_rate_sum
=
rate
;
*
out_dist_sum
=
dist
<<
3
;
}
...
...
@@ -199,6 +203,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
VP9_ALT_FLAG
};
int64_t
best_rd
=
INT64_MAX
;
int64_t
this_rd
=
INT64_MAX
;
int
skip_txfm
=
0
;
int
rate
=
INT_MAX
;
int64_t
dist
=
INT64_MAX
;
...
...
@@ -341,6 +346,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if
(
cost
<
best_cost
)
{
best_filter
=
filter
;
best_cost
=
cost
;
skip_txfm
=
x
->
skip_txfm
;
}
}
...
...
@@ -349,6 +355,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
dist
=
pf_dist
[
mbmi
->
interp_filter
];
var_y
=
pf_var
[
mbmi
->
interp_filter
];
sse_y
=
pf_sse
[
mbmi
->
interp_filter
];
x
->
skip_txfm
=
skip_txfm
;
}
else
{
mbmi
->
interp_filter
=
(
filter_ref
==
SWITCHABLE
)
?
EIGHTTAP
:
filter_ref
;
vp9_build_inter_predictors_sby
(
xd
,
mi_row
,
mi_col
,
bsize
);
...
...
@@ -438,6 +445,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
best_mode
=
this_mode
;
best_pred_filter
=
mbmi
->
interp_filter
;
best_ref_frame
=
ref_frame
;
skip_txfm
=
x
->
skip_txfm
;
}
if
(
x
->
skip
)
...
...
@@ -450,6 +458,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi
->
ref_frame
[
0
]
=
best_ref_frame
;
mbmi
->
mv
[
0
].
as_int
=
frame_mv
[
best_mode
][
best_ref_frame
].
as_int
;
xd
->
mi
[
0
]
->
bmi
[
0
].
as_mv
[
0
].
as_int
=
mbmi
->
mv
[
0
].
as_int
;
x
->
skip_txfm
=
skip_txfm
;
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
...
...
@@ -474,6 +483,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi
->
ref_frame
[
0
]
=
INTRA_FRAME
;
mbmi
->
uv_mode
=
this_mode
;
mbmi
->
mv
[
0
].
as_int
=
INVALID_MV
;
}
else
{
x
->
skip_txfm
=
skip_txfm
;
}
}
}
...
...
vp9/encoder/vp9_quantize.c
View file @
d5ae4331
...
...
@@ -19,6 +19,50 @@
#include
"vp9/encoder/vp9_quantize.h"
#include
"vp9/encoder/vp9_rdopt.h"
// Quantizes the DC coefficient of a block and reconstructs its dequantized
// value; no other coefficient is read or written.
//   coeff_ptr   - transform coefficients; only coeff_ptr[0] is used
//   skip_block  - when non-zero, nothing is quantized and *eob_ptr is 0
//   round_ptr   - rounding offsets; round_ptr[0] is added to |DC|
//   quant       - 16-bit fixed-point multiplier applied as (v * quant) >> 16
//   qcoeff_ptr  - qcoeff_ptr[0] receives the signed quantized level
//   dqcoeff_ptr - dqcoeff_ptr[0] receives level * dequant_ptr
//   dequant_ptr - dequantization step (a value, despite the name)
//   eob_ptr     - set to 1 if the quantized DC is non-zero, otherwise 0
// When skip_block is set, qcoeff_ptr and dqcoeff_ptr are left untouched.
void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
                     const int16_t *round_ptr, const int16_t quant,
                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
  int nonzero_dc = 0;

  if (!skip_block) {
    const int dc = coeff_ptr[0];
    // All-ones mask for a negative DC, zero otherwise; used to take the
    // absolute value and later to restore the sign.
    const int sign_mask = (dc >> 31);
    const int magnitude = (dc ^ sign_mask) - sign_mask;
    int level = clamp(magnitude + round_ptr[0], INT16_MIN, INT16_MAX);

    level = (level * quant) >> 16;
    qcoeff_ptr[0] = (level ^ sign_mask) - sign_mask;
    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
    nonzero_dc = (level != 0);
  }

  *eob_ptr = nonzero_dc;
}
// 32x32 variant of the DC-only quantizer: identical to vp9_quantize_dc
// except that the multiplier is applied with a 15-bit shift and the
// dequantized value is halved.
//   coeff_ptr   - transform coefficients; only coeff_ptr[0] is used
//   skip_block  - when non-zero, nothing is quantized and *eob_ptr is 0
//   round_ptr   - rounding offsets; round_ptr[0] is added to |DC|
//   quant       - fixed-point multiplier applied as (v * quant) >> 15
//   qcoeff_ptr  - qcoeff_ptr[0] receives the signed quantized level
//   dqcoeff_ptr - dqcoeff_ptr[0] receives level * dequant_ptr / 2
//   dequant_ptr - dequantization step (a value, despite the name)
//   eob_ptr     - set to 1 if the quantized DC is non-zero, otherwise 0
// When skip_block is set, qcoeff_ptr and dqcoeff_ptr are left untouched.
void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
  int nonzero_dc = 0;

  if (!skip_block) {
    const int dc = coeff_ptr[0];
    // All-ones mask for a negative DC, zero otherwise; used to take the
    // absolute value and later to restore the sign.
    const int sign_mask = (dc >> 31);
    const int magnitude = (dc ^ sign_mask) - sign_mask;
    int level = clamp(magnitude + round_ptr[0], INT16_MIN, INT16_MAX);

    level = (level * quant) >> 15;
    qcoeff_ptr[0] = (level ^ sign_mask) - sign_mask;
    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
    nonzero_dc = (level != 0);
  }

  *eob_ptr = nonzero_dc;
}
void
vp9_quantize_b_c
(
const
int16_t
*
coeff_ptr
,
intptr_t
count
,
int
skip_block
,
const
int16_t
*
zbin_ptr
,
const
int16_t
*
round_ptr
,
...
...
@@ -167,6 +211,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quant
=
i
==
0
?
vp9_dc_quant
(
q
,
cm
->
y_dc_delta_q
)
:
vp9_ac_quant
(
q
,
0
);
invert_quant
(
&
quants
->
y_quant
[
q
][
i
],
&
quants
->
y_quant_shift
[
q
][
i
],
quant
);
quants
->
y_quant_fp
[
q
][
i
]
=
(
1
<<
16
)
/
quant
;
quants
->
y_zbin
[
q
][
i
]
=
ROUND_POWER_OF_TWO
(
qzbin_factor
*
quant
,
7
);
quants
->
y_round
[
q
][
i
]
=
(
qrounding_factor
*
quant
)
>>
7
;
cm
->
y_dequant
[
q
][
i
]
=
quant
;
...
...
@@ -176,6 +221,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
:
vp9_ac_quant
(
q
,
cm
->
uv_ac_delta_q
);
invert_quant
(
&
quants
->
uv_quant
[
q
][
i
],
&
quants
->
uv_quant_shift
[
q
][
i
],
quant
);
quants
->
uv_quant_fp
[
q
][
i
]
=
(
1
<<
16
)
/
quant
;
quants
->
uv_zbin
[
q
][
i
]
=
ROUND_POWER_OF_TWO
(
qzbin_factor
*
quant
,
7
);
quants
->
uv_round
[
q
][
i
]
=
(
qrounding_factor
*
quant
)
>>
7
;
cm
->
uv_dequant
[
q
][
i
]
=
quant
;
...
...
@@ -193,12 +239,14 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
for
(
i
=
2
;
i
<
8
;
i
++
)
{
quants
->
y_quant
[
q
][
i
]
=
quants
->
y_quant
[
q
][
1
];
quants
->
y_quant_fp
[
q
][
i
]
=
quants
->
y_quant_fp
[
q
][
1
];
quants
->
y_quant_shift
[
q
][
i
]
=
quants
->
y_quant_shift
[
q
][
1
];
quants
->
y_zbin
[
q
][
i
]
=
quants
->
y_zbin
[
q
][
1
];
quants
->
y_round
[
q
][
i
]
=
quants
->
y_round
[
q
][
1
];
cm
->
y_dequant
[
q
][
i
]
=
cm
->
y_dequant
[
q
][
1
];
quants
->
uv_quant
[
q
][
i
]
=
quants
->
uv_quant
[
q
][
1
];
quants
->
uv_quant_fp
[
q
][
i
]
=
quants
->
uv_quant_fp
[
q
][
1
];
quants
->
uv_quant_shift
[
q
][
i
]
=
quants
->
uv_quant_shift
[
q
][
1
];
quants
->
uv_zbin
[
q
][
i
]
=
quants
->
uv_zbin
[
q
][
1
];
quants
->
uv_round
[
q
][
i
]
=
quants
->
uv_round
[
q
][
1
];
...
...
@@ -227,6 +275,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// Y
x
->
plane
[
0
].
quant
=
quants
->
y_quant
[
qindex
];
x
->
plane
[
0
].
quant_fp
=
quants
->
y_quant_fp
[
qindex
];
x
->
plane
[
0
].
quant_shift
=
quants
->
y_quant_shift
[
qindex
];
x
->
plane
[
0
].
zbin
=
quants
->
y_zbin
[
qindex
];
x
->
plane
[
0
].
round
=
quants
->
y_round
[
qindex
];
...
...
@@ -236,6 +285,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// UV
for
(
i
=
1
;
i
<
3
;
i
++
)
{
x
->
plane
[
i
].
quant
=
quants
->
uv_quant
[
qindex
];
x
->
plane
[
i
].
quant_fp
=
quants
->
uv_quant_fp
[
qindex
];
x
->
plane
[
i
].
quant_shift
=
quants
->
uv_quant_shift
[
qindex
];
x
->
plane
[
i
].
zbin
=
quants
->
uv_zbin
[
qindex
];
x
->
plane
[
i
].
round
=
quants
->
uv_round
[
qindex
];
...
...
vp9/encoder/vp9_quantize.h
View file @
d5ae4331
...
...
@@ -24,6 +24,11 @@ typedef struct {
DECLARE_ALIGNED
(
16
,
int16_t
,
y_zbin
[
QINDEX_RANGE
][
8
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
y_round
[
QINDEX_RANGE
][
8
]);
// TODO(jingning): in progress of re-working the quantization. will decide
// if we want to deprecate the current use of y_quant.
DECLARE_ALIGNED
(
16
,
int16_t
,
y_quant_fp
[
QINDEX_RANGE
][
8
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
uv_quant_fp
[
QINDEX_RANGE
][
8
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
uv_quant
[
QINDEX_RANGE
][
8
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
uv_quant_shift
[
QINDEX_RANGE
][
8
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
uv_zbin
[
QINDEX_RANGE
][
8
]);
...
...
@@ -37,6 +42,14 @@ typedef struct {
#endif
}
QUANTS
;
void
vp9_quantize_dc
(
const
int16_t
*
coeff_ptr
,
int
skip_block
,
const
int16_t
*
round_ptr
,
const
int16_t
quant_ptr
,
int16_t
*
qcoeff_ptr
,
int16_t
*
dqcoeff_ptr
,
const
int16_t
dequant_ptr
,
uint16_t
*
eob_ptr
);
void
vp9_quantize_dc_32x32
(
const
int16_t
*
coeff_ptr
,
int
skip_block
,
const
int16_t
*
round_ptr
,
const
int16_t
quant_ptr
,
int16_t
*
qcoeff_ptr
,
int16_t
*
dqcoeff_ptr
,
const
int16_t
dequant_ptr
,
uint16_t
*
eob_ptr
);
void
vp9_regular_quantize_b_4x4
(
MACROBLOCK
*
x
,
int
plane
,
int
block
,
const
int16_t
*
scan
,
const
int16_t
*
iscan
);
...
...
vp9/encoder/x86/vp9_dct_sse2.c
View file @
d5ae4331
...
...
@@ -12,6 +12,35 @@
#include
"vp9/common/vp9_idct.h"
// for cospi constants
#include
"vpx_ports/mem.h"
// SSE2 DC-only forward transform for a 4x4 block: adds up the 16 input
// samples and stores twice that total.
//   input  - top-left sample of the block; each row is read with a 64-bit
//            load (four int16_t samples)
//   output - destination for the result (see the note at the store below)
//   stride - distance, in samples, between the starts of consecutive rows
void
vp9_fdct4x4_1_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
__m128i
in0
,
in1
;
__m128i
tmp
;
const
__m128i
zero
=
_mm_setzero_si128
();
// Load rows 0 and 1 into the low 64 bits of in0 and in1.
in0
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
0
*
stride
));
in1
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
1
*
stride
));
// Pack row 2 next to row 1, and row 3 next to row 0, so that two
// registers hold all 16 samples.
in1
=
_mm_unpacklo_epi64
(
in1
,
_mm_loadl_epi64
((
const
__m128i
*
)
(
input
+
2
*
stride
)));
in0
=
_mm_unpacklo_epi64
(
in0
,
_mm_loadl_epi64
((
const
__m128i
*
)
(
input
+
3
*
stride
)));
// Pairwise 16-bit add: eight partial sums of two samples each.
tmp
=
_mm_add_epi16
(
in0
,
in1
);
// Widen the eight 16-bit partial sums to 32 bits with sign extension:
// interleaving with zero puts each value in the upper half of a 32-bit
// lane, and the arithmetic right shift by 16 brings it back down.
in0
=
_mm_unpacklo_epi16
(
zero
,
tmp
);
in1
=
_mm_unpackhi_epi16
(
zero
,
tmp
);
in0
=
_mm_srai_epi32
(
in0
,
16
);
in1
=
_mm_srai_epi32
(
in1
,
16
);
// Horizontal reduction of the 32-bit lanes down to lane 0.
tmp
=
_mm_add_epi32
(
in0
,
in1
);
in0
=
_mm_unpacklo_epi32
(
tmp
,
zero
);
in1
=
_mm_unpackhi_epi32
(
tmp
,
zero
);
tmp
=
_mm_add_epi32
(
in0
,
in1
);
in0
=
_mm_srli_si128
(
tmp
,
8
);
in1
=
_mm_add_epi32
(
tmp
,
in0
);
// Scale the total by 2.
in0
=
_mm_slli_epi32
(
in1
,
1
);
// NOTE: this is a full 128-bit aligned store, so `output` must be 16-byte
// aligned and eight int16_t slots are overwritten, not only output[0].
// The total sits in a 32-bit lane, so output[1] receives its upper
// 16 bits rather than an explicit zero.
_mm_store_si128
((
__m128i
*
)(
output
),
in0
);
}
void
vp9_fdct4x4_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// This 2D transform implements 4 vertical 1D transforms followed
// by 4 horizontal 1D transforms. The multiplies and adds are as given
...
...
@@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
}
}
// SSE2 DC-only forward transform for an 8x8 block: adds up the 64 input
// samples and stores the total without further scaling.
//   input  - top-left sample of the block; each row is read with an aligned
//            128-bit load (eight int16_t samples), so every row start must
//            be 16-byte aligned
//   output - destination for the result (see the note at the store below)
//   stride - distance, in samples, between the starts of consecutive rows
void
vp9_fdct8x8_1_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// Rows 0..3.
__m128i
in0
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
0
*
stride
));
__m128i
in1
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
1
*
stride
));
__m128i
in2
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
2
*
stride
));
__m128i
in3
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
3
*
stride
));
__m128i
u0
,
u1
,
sum
;
u0
=
_mm_add_epi16
(
in0
,
in1
);
u1
=
_mm_add_epi16
(
in2
,
in3
);
// Rows 4..7 reuse the same registers.
in0
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
4
*
stride
));
in1
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
5
*
stride
));
in2
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
6
*
stride
));
in3
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
7
*
stride
));
// Accumulate all eight rows into eight 16-bit column sums.  These adds
// are 16-bit and wrap on overflow.
// NOTE(review): presumably residual magnitudes keep each column sum
// within int16_t range; confirm against the callers.
sum
=
_mm_add_epi16
(
u0
,
u1
);
in0
=
_mm_add_epi16
(
in0
,
in1
);
in2
=
_mm_add_epi16
(
in2
,
in3
);
sum
=
_mm_add_epi16
(
sum
,
in0
);
// u0 is reused as an all-zero register from here on.
u0
=
_mm_setzero_si128
();
sum
=
_mm_add_epi16
(
sum
,
in2
);
// Widen the eight 16-bit column sums to 32 bits with sign extension:
// interleaving with zero puts each value in the upper half of a 32-bit
// lane, and the arithmetic right shift by 16 brings it back down.
in0
=
_mm_unpacklo_epi16
(
u0
,
sum
);
in1
=
_mm_unpackhi_epi16
(
u0
,
sum
);
in0
=
_mm_srai_epi32
(
in0
,
16
);
in1
=
_mm_srai_epi32
(
in1
,
16
);
// Horizontal reduction of the 32-bit lanes down to lane 0.
sum
=
_mm_add_epi32
(
in0
,
in1
);
in0
=
_mm_unpacklo_epi32
(
sum
,
u0
);
in1
=
_mm_unpackhi_epi32
(
sum
,
u0
);
sum
=
_mm_add_epi32
(
in0
,
in1
);
in0
=
_mm_srli_si128
(
sum
,
8
);
in1
=
_mm_add_epi32
(
sum
,
in0
);
// NOTE: this is a full 128-bit aligned store, so `output` must be 16-byte
// aligned and eight int16_t slots are overwritten, not only output[0].
// The total sits in a 32-bit lane, so output[1] receives its upper
// 16 bits rather than an explicit zero.
_mm_store_si128
((
__m128i
*
)(
output
),
in1
);
}
void
vp9_fdct8x8_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
int
pass
;
// Constants
...
...
@@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
}
}
void
vp9_fdct16x16_1_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
__m128i
in0
,
in1
,
in2
,
in3
;
__m128i
u0
,
u1
;
__m128i
sum
=
_mm_setzero_si128
();
int
i
;
for
(
i
=
0
;
i
<
2
;
++
i
)
{
input
+=
8
*
i
;
in0
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
0
*
stride
));