Commit edaf8c45 authored by Yaowu Xu
Browse files

Merge branch 'master' into nextgenv2

parents 0e929ef9 79a19469
Adrian Grange <agrange@google.com>
Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
Aℓex Converse <aconverse@google.com>
Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Marco Paniconi <marpan@google.com>
......@@ -17,10 +24,13 @@ Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
......@@ -5,9 +5,9 @@ Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Aℓex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
......@@ -16,8 +16,10 @@ Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
......@@ -27,6 +29,7 @@ Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
Ed Baker <edward.baker@intel.com>
Ehsan Akhgari <ehsan.akhgari@gmail.com>
Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
......@@ -34,6 +37,8 @@ Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
......@@ -44,7 +49,7 @@ Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
Jacek Caban <cjacek@gmail.com>
JackyChen <jackychen@google.com>
Jacky Chen <jackychen@google.com>
James Berry <jamesberry@google.com>
James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
......@@ -60,9 +65,11 @@ Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Julia Robson <juliamrobson@gmail.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
......@@ -82,6 +89,7 @@ Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
......@@ -96,7 +104,7 @@ Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org>
......@@ -104,6 +112,7 @@ Scott LaVarnway <slavarnway@google.com>
Sean McGovern <gseanmcg@gmail.com>
Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
......
xxxx-yy-zz v1.4.0 "Changes for next release"
vpxenc is changed to use VP9 by default.
Encoder controls added for 1 pass SVC.
Decoder control to toggle on/off loopfilter.
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
- Upgrading:
This release is ABI incompatible with 1.4.0. It drops deprecated VP8
controls and adds a variety of VP9 controls for testing.
The vpxenc utility now prefers VP9 by default.
- Enhancements:
Faster VP9 encoding and decoding
Smaller library size by combining functions used by VP8 and VP9
- Bug Fixes:
A variety of fuzzing issues
2015-04-03 v1.4.0 "Indian Runner Duck"
This release includes significant improvements to the VP9 codec.
......
......@@ -260,7 +260,7 @@ OBJS-yes += $(LIBVPX_OBJS)
LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
SO_VERSION_MAJOR := 2
SO_VERSION_MAJOR := 3
SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
......
......@@ -286,11 +286,11 @@ TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
protected:
ResizeInternalRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ResizeInternalRealtimeTest() {}
ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ResizeRealtimeTest() {}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
......@@ -318,8 +318,6 @@ class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
}
void DefaultConfig() {
cfg_.g_w = 352;
cfg_.g_h = 288;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
cfg_.rc_buf_sz = 1000;
......@@ -346,13 +344,34 @@ class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
bool change_bitrate_;
};
// Runs the encode loop over a ResizingVideoSource after DefaultConfig(), with
// change_bitrate_ cleared, then checks that every entry recorded in
// frame_info_list_ has the width and height that ScaleForFrameNumber() yields
// for that entry's pts.
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
  ResizingVideoSource video;
  DefaultConfig();
  change_bitrate_ = false;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

  std::vector<FrameInfo>::const_iterator it = frame_info_list_.begin();
  for (; it != frame_info_list_.end(); ++it) {
    // The presentation timestamp doubles as the frame number here.
    const unsigned int frame_num = static_cast<unsigned>(it->pts);
    const unsigned int want_w = ScaleForFrameNumber(frame_num, kInitialWidth);
    const unsigned int want_h = ScaleForFrameNumber(frame_num, kInitialHeight);

    EXPECT_EQ(want_w, it->w)
        << "Frame " << frame_num << " had unexpected width";
    EXPECT_EQ(want_h, it->h)
        << "Frame " << frame_num << " had unexpected height";
  }
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Run at low bitrate, with resize_allowed = 1, and verify that we get
// one resize down event.
TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 299);
DefaultConfig();
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = false;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
......@@ -378,10 +397,12 @@ TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Start at low target bitrate, raise the bitrate in the middle of the clip,
// scaling-up should occur after bitrate changed.
TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 299);
30, 1, 0, 359);
DefaultConfig();
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = true;
// Disable dropped frames.
cfg_.rc_dropframe_thresh = 0;
......@@ -524,7 +545,7 @@ VP9_INSTANTIATE_TEST_CASE(ResizeTest,
::testing::Values(::libvpx_test::kRealTime));
VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
::testing::Values(::libvpx_test::kOnePassBest));
VP9_INSTANTIATE_TEST_CASE(ResizeInternalRealtimeTest,
VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 9));
VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
......
......@@ -1979,6 +1979,8 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (!cm->error_resilient_mode) {
cm->refresh_frame_context = vpx_rb_read_bit(rb);
cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
if (!cm->frame_parallel_decoding_mode)
vp9_zero(cm->counts);
} else {
cm->refresh_frame_context = 0;
cm->frame_parallel_decoding_mode = 1;
......@@ -2202,8 +2204,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Uninitialized entropy context.");
vp9_zero(cm->counts);
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
if (new_fb->corrupted)
......
......@@ -123,72 +123,66 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
static void pack_mb_tokens(vpx_writer *w,
TOKENEXTRA **tp, const TOKENEXTRA *const stop,
vpx_bit_depth_t bit_depth) {
TOKENEXTRA *p = *tp;
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
const struct vp9_token *const a = &vp9_coef_encodings[t];
int i = 0;
int v = a->value;
int n = a->len;
const TOKENEXTRA *p;
const vp9_extra_bit *const extra_bits =
#if CONFIG_VP9_HIGHBITDEPTH
const vp9_extra_bit *b;
if (bit_depth == VPX_BITS_12)
b = &vp9_extra_bits_high12[t];
else if (bit_depth == VPX_BITS_10)
b = &vp9_extra_bits_high10[t];
else
b = &vp9_extra_bits[t];
(bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 :
(bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 :
vp9_extra_bits;
#else
const vp9_extra_bit *const b = &vp9_extra_bits[t];
vp9_extra_bits;
(void) bit_depth;
#endif // CONFIG_VP9_HIGHBITDEPTH
/* skip one or two nodes */
if (p->skip_eob_node) {
n -= p->skip_eob_node;
i = 2 * p->skip_eob_node;
for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
if (p->token == EOB_TOKEN) {
vpx_write(w, 0, p->context_tree[0]);
continue;
}
// TODO(jbb): expanding this can lead to big gains. It allows
// much better branch prediction and would enable us to avoid numerous
// lookups and compares.
// If we have a token that's in the constrained set, the coefficient tree
// is split into two treed writes. The first treed write takes care of the
// unconstrained nodes. The second treed write takes care of the
// constrained nodes.
if (t >= TWO_TOKEN && t < EOB_TOKEN) {
int len = UNCONSTRAINED_NODES - p->skip_eob_node;
int bits = v >> (n - len);
vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
vp9_write_tree(w, vp9_coef_con_tree,
vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
v, n - len, 0);
} else {
vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
vpx_write(w, 1, p->context_tree[0]);
while (p->token == ZERO_TOKEN) {
vpx_write(w, 0, p->context_tree[1]);
++p;
if (p == stop || p->token == EOSB_TOKEN) {
*tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
return;
}
}
if (b->base_val) {
const int e = p->extra, l = b->len;
if (l) {
const unsigned char *pb = b->prob;
int v = e >> 1;
int n = l; /* number of bits in v, assumed nonzero */
do {
const int bb = (v >> --n) & 1;
vpx_write(w, bb, *pb++);
} while (n);
{
const int t = p->token;
const vpx_prob *const context_tree = p->context_tree;
assert(t != ZERO_TOKEN);
assert(t != EOB_TOKEN);
assert(t != EOSB_TOKEN);
vpx_write(w, 1, context_tree[1]);
if (t == ONE_TOKEN) {
vpx_write(w, 0, context_tree[2]);
vpx_write_bit(w, p->extra & 1);
} else { // t >= TWO_TOKEN && t < EOB_TOKEN
const struct vp9_token *const a = &vp9_coef_encodings[t];
const int v = a->value;
const int n = a->len;
const int e = p->extra;
vpx_write(w, 1, context_tree[2]);
vp9_write_tree(w, vp9_coef_con_tree,
vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
n - UNCONSTRAINED_NODES, 0);
if (t >= CATEGORY1_TOKEN) {
const vp9_extra_bit *const b = &extra_bits[t];
const unsigned char *pb = b->prob;
int v = e >> 1;
int n = b->len; // number of bits in v, assumed nonzero
do {
const int bb = (v >> --n) & 1;
vpx_write(w, bb, *pb++);
} while (n);
}
vpx_write_bit(w, e & 1);
}
vpx_write_bit(w, e & 1);
}
++p;
}
*tp = p + (p->token == EOSB_TOKEN);
*tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
}
static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
......
......@@ -29,6 +29,8 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
ne->value = 0;
ne->count = 0;
ne->thresh = 90;
ne->last_w = 0;
ne->last_h = 0;
if (width * height >= 1920 * 1080) {
ne->thresh = 200;
} else if (width * height >= 1280 * 720) {
......@@ -100,11 +102,17 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->enabled = enable_noise_estimation(cpi);
if (!ne->enabled ||
cm->current_video_frame % frame_period != 0 ||
last_source == NULL) {
last_source == NULL ||
ne->last_w != cm->width ||
ne->last_h != cm->height) {
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0)
copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif
if (last_source != NULL) {
ne->last_w = cm->width;
ne->last_h = cm->height;
}
return;
} else {
int num_samples = 0;
......@@ -185,6 +193,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
}
ne->last_w = cm->width;
ne->last_h = cm->height;
// Update noise estimate if we have at least a minimum number of block
// samples, and avg_est > 0 (avg_est == 0 can happen if the application
// inputs duplicate frames).
......
......@@ -35,6 +35,8 @@ typedef struct noise_estimate {
int value;
int thresh;
int count;
int last_w;
int last_h;
} NOISE_ESTIMATE;
struct VP9_COMP;
......
......@@ -1483,18 +1483,30 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
this_rdc.rate += ref_frame_cost[ref_frame];
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
// Bias against non-zero (above some threshold) motion for large blocks.
// This is temporary fix to avoid selection of large mv for big blocks.
if (cpi->oxcf.speed > 5 &&
cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
(frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
frame_mv[this_mode][ref_frame].as_mv.col < -64)) {
if (bsize == BLOCK_64X64)
this_rdc.rdcost = this_rdc.rdcost << 1;
else if (bsize >= BLOCK_32X32)
this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
if (cpi->oxcf.speed >= 5 &&
cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
// Bias against non-zero (above some threshold) motion for large blocks.
// This is a temporary fix to avoid selection of large mv for big blocks.
if (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
frame_mv[this_mode][ref_frame].as_mv.col < -64) {
if (bsize == BLOCK_64X64)
this_rdc.rdcost = this_rdc.rdcost << 1;
else if (bsize >= BLOCK_32X32)
this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
}
// If noise estimation is enabled, and estimated level is above threshold,
// add a bias to LAST reference with small motion, for large blocks.
if (cpi->noise_estimate.enabled &&
cpi->noise_estimate.level >= kMedium &&
bsize >= BLOCK_32X32 &&
ref_frame == LAST_FRAME &&
frame_mv[this_mode][ref_frame].as_mv.row < 8 &&
frame_mv[this_mode][ref_frame].as_mv.row > -8 &&
frame_mv[this_mode][ref_frame].as_mv.col < 8 &&
frame_mv[this_mode][ref_frame].as_mv.col > -8)
this_rdc.rdcost = 7 * this_rdc.rdcost >> 3;
}
// Skipping checking: test to see if this block can be reconstructed by
......
......@@ -1873,7 +1873,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
// Resize based on average buffer underflow and QP over some window.
// Ignore samples close to key frame, since QP is usually high after key.
if (cpi->rc.frames_since_key > 1 * cpi->framerate) {
if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
const int window = (int)(4 * cpi->framerate);
cpi->resize_avg_qp += cm->base_qindex;
if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
......
......@@ -893,7 +893,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/;
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 34 eob idct32x32 neon implementation.
$vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;
......
......@@ -17,12 +17,31 @@
SECTION_RODATA
; Read-only SIMD constant tables.
;
; pw_*x2 tables hold eight 16-bit words, each equal to twice the number in
; the label (23170 = 2 * 11585); an "m" before the number marks a negated
; value and extra underscores only pad the label width. Several of them are
; used as pmulhrsw multipliers in IDCT32X32_34x.
; NOTE(review): the doubling presumably offsets pmulhrsw's >> 15 so the
; product is effectively scaled by 2^-14 -- confirm against the C reference.
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
pw_16207x2: times 8 dw 16207*2
pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
pw___804x2: times 8 dw 804*2
; Four dwords of 8192 and eight words of 32 / 16.
; NOTE(review): presumably rounding terms (8192 = 1 << 13); their use sites
; are outside this view -- confirm.
pd_8192: times 4 dd 8192
pw_32: times 8 dw 32
pw_16: times 8 dw 16
; TRANSFORM_COEFFS a, b
; Emits three 8-word tables for the coefficient pair (a, b), each repeating
; one word pair four times:
;   pw_<a>_<b>   = ( a,  b) x 4
;   pw_m<b>_<a>  = (-b,  a) x 4
;   pw_m<a>_m<b> = (-a, -b) x 4
; BUTTERFLY_4Xmm reads the second and third tables.
%macro TRANSFORM_COEFFS 2
pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro
TRANSFORM_COEFFS 6270, 15137
......@@ -80,6 +99,15 @@ SECTION .text
packssdw m%2, m%6
%endmacro
; BUTTERFLY_4Xmm dst1, dst2, coef1, coef2, round, tmp1, tmp2
; Butterfly on the word lanes of m<dst1> and m<dst2> that takes its
; coefficient pairs from [pw_m<coef2>_<coef1>] and [pw_m<coef1>_m<coef2>],
; i.e. the (-b, a) and (-a, -b) tables emitted by
; TRANSFORM_COEFFS coef1, coef2.
; Results are left in m<dst1> and m<dst2>; m<tmp1> and m<tmp2> are
; overwritten.
; NOTE(review): MUL_ADD_2X is defined outside this view; its multiply/round
; behaviour and how it uses the round argument should be confirmed there.
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
; tmp1 = high four words of dst2 and dst1, interleaved
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
; dst2 = low four words of dst2 and dst1, interleaved
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
; pack the dword results of the low/high halves back into words
; (signed saturation)
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
; matrix transpose
%macro INTERLEAVE_2X 4
punpckh%1 m%4, m%2, m%3
......@@ -298,4 +326,434 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
RET
; idxN expands to the byte offset 16 * (N mod 8). The offsets wrap every
; eight indices, so idx0, idx8, idx16 and idx24 all name offset 0.
; NOTE(review): presumably each group of eight is applied to a different
; base address; the use sites are outside this view -- confirm.
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7
%macro IDCT32X32_34x 4
; FROM idct32x32_add_neon.asm
;
; Instead of doing the transforms stage by stage, it is done by loading
; some input values and doing as many stages as possible to minimize the
; storing/loading of intermediate results. To fit within registers, the
; final coefficients are cut into four blocks:
; BLOCK A: 16-19,28-31
; BLOCK B: 20-23,24-27
; BLOCK C: 8-11,12-15
; BLOCK D: 0-3,4-7
; Blocks A and C are straight calculation through the various stages. In
; block B, further calculations are performed using the results from
; block A. In block D, further calculations are performed using the results
; from block C and then the final calculations are done using results from
; block A and B which have been combined at the end of block B.
;
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
mova [r4 + 0], m0
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova [r4 + 16 * 2], m2
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
mova [r4 + 16 * 4], m4
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova [r4 + 16 * 6], m6
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, m1 ; stp1_16
mova m0, m11 ; stp1_31
mova m15, m12 ; stp1_19
mova m4, m7 ; stp1_28
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28