Commit 87175ed5 authored by Angie Chiang's avatar Angie Chiang

Isolate vp10's inv_txfm from vp9

1) copy following files from vpx_dsp/ to vp10/common/
vp10_inv_txfm.c
vp10_inv_txfm.h
vp10_inv_txfm_sse2.c
vp10_inv_txfm_sse2.h

2) change the function prefix "vpx_" to "vp10_" in above files

3) add unit test at vp10_inv_txfm_test.cc

Change-Id: I206f10f60c8b27d872c84b7482c3bb1d1cb4b913
parent 9511b948
......@@ -167,6 +167,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
endif # CONFIG_SHARED
include $(SRC_PATH_BARE)/test/test-data.mk
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp10/common/blockd.h"
#include "vp10/common/scan.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/vp10_inv_txfm.h"
using libvpx_test::ACMRandom;
namespace {
const double PI = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
void reference_idct_1d(const double *in, double *out, int size) {
for (int n = 0; n < size; ++n) {
out[n] = 0;
for (int k = 0; k < size; ++k) {
if (k == 0)
out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
else
out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
}
}
}
typedef void (*IdctFuncRef)(const double *in, double *out, int size);
typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
class TransTestBase {
public:
virtual ~TransTestBase() {}
protected:
void RunInvAccuracyCheck() {
tran_low_t *input = new tran_low_t[txfm_size_];
tran_low_t *output = new tran_low_t[txfm_size_];
double *ref_input = new double[txfm_size_];
double *ref_output = new double[txfm_size_];
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
for (int ti = 0; ti < count_test_block; ++ti) {
for (int ni = 0; ni < txfm_size_; ++ni) {
input[ni] = rnd.Rand8() - rnd.Rand8();
ref_input[ni] = static_cast<double>(input[ni]);
}
fwd_txfm_(input, output);
fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
for (int ni = 0; ni < txfm_size_; ++ni) {
EXPECT_LE(
abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
max_error_);
}
}
delete[] input;
delete[] output;
delete[] ref_input;
delete[] ref_output;
}
double max_error_;
int txfm_size_;
IdctFunc fwd_txfm_;
IdctFuncRef fwd_txfm_ref_;
};
typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
class Vp10InvTxfm
: public TransTestBase,
public ::testing::TestWithParam<IdctParam> {
public:
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
fwd_txfm_ref_ = GET_PARAM(1);
txfm_size_ = GET_PARAM(2);
max_error_ = GET_PARAM(3);
}
virtual void TearDown() {}
};
TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
RunInvAccuracyCheck();
}
INSTANTIATE_TEST_CASE_P(
C, Vp10InvTxfm,
::testing::Values(
IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
);
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef std::tr1::tuple<FwdTxfmFunc,
InvTxfmFunc,
InvTxfmFunc,
TX_SIZE, int> PartialInvTxfmParam;
const int kMaxNumCoeffs = 1024;
class Vp10PartialIDctTest
: public ::testing::TestWithParam<PartialInvTxfmParam> {
public:
virtual ~Vp10PartialIDctTest() {}
virtual void SetUp() {
ftxfm_ = GET_PARAM(0);
full_itxfm_ = GET_PARAM(1);
partial_itxfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int last_nonzero_;
TX_SIZE tx_size_;
FwdTxfmFunc ftxfm_;
InvTxfmFunc full_itxfm_;
InvTxfmFunc partial_itxfm_;
};
TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int block_size = size * size;
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
if (i == 0) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = 255;
} else if (i == 1) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = -255;
} else {
for (int j = 0; j < block_size; ++j) {
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
}
ftxfm_(input_extreme_block, output_ref_block, size);
// quantization with maximum allowed step sizes
test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
for (int j = 1; j < last_nonzero_; ++j)
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
= (output_ref_block[j] / 1828) * 1828;
}
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
TEST_P(Vp10PartialIDctTest, ResultsMatch) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int max_coeff = 32766 / 4;
const int block_size = size * size;
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
int max_energy_leftover = max_coeff * max_coeff;
for (int j = 0; j < last_nonzero_; ++j) {
int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
(rnd.Rand16() - 32768) / 65536);
max_energy_leftover -= coef * coef;
if (max_energy_leftover < 0) {
max_energy_leftover = 0;
coef = 0;
}
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
}
memcpy(test_coef_block2, test_coef_block1,
sizeof(*test_coef_block2) * block_size);
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
C, Vp10PartialIDctTest,
::testing::Values(
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_34_add_c,
TX_32X32, 34),
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_1_add_c,
TX_32X32, 1),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_10_add_c,
TX_16X16, 10),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_1_add_c,
TX_16X16, 1),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_12_add_c,
TX_8X8, 12),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_1_add_c,
TX_8X8, 1),
make_tuple(&vpx_fdct4x4_c,
&vp10_idct4x4_16_add_c,
&vp10_idct4x4_1_add_c,
TX_4X4, 1)));
} // namespace
......@@ -695,6 +695,13 @@ DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
1023,
};
const scan_order vp10_default_scan_orders[TX_SIZES] = {
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
{default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
{default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
{default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
};
const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
......
......@@ -29,6 +29,7 @@ typedef struct {
const int16_t *neighbors;
} scan_order;
extern const scan_order vp10_default_scan_orders[TX_SIZES];
extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,
......
This diff is collapsed.
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_INV_TXFM_H_
#define VPX_DSP_INV_TXFM_H_
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
static INLINE tran_low_t check_range(tran_high_t input) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
// this range for every intermediate coefficient can burdensome for a decoder,
// therefore the following assertion is only enabled when configured with
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return check_range(rv);
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE tran_low_t highbd_check_range(tran_high_t input,
int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid highbitdepth VP9 streams, intermediate stage coefficients will
// stay within the ranges:
// - 8 bit: signed 16 bit integer
// - 10 bit: signed 18 bit integer
// - 12 bit: signed 20 bit integer
const int32_t int_max = (1 << (7 + bd)) - 1;
const int32_t int_min = -int_max - 1;
assert(int_min <= input);
assert(input <= int_max);
(void) int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
(void) bd;
return (tran_low_t)input;
}
static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
int bd) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return highbd_check_range(rv, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) ((int32_t)(x))
#endif // CONFIG_EMULATE_HARDWARE
void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_DSP_INV_TXFM_H_
......@@ -289,6 +289,188 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
# Inverse transform
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_1_add/;
add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_1_add/;
add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_1_add/;
add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1024_add/;
add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_34_add/;
add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
} else {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add sse2/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add sse2/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add sse2/;