Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
600a3860
Commit
600a3860
authored
Oct 24, 2013
by
Dmitry Kovalev
Browse files
Making input pointer constant for all fdct/fht functions.
Change-Id: I78f7012f967a777ddd39bae6671eb501df6bbfe8
parent
1dcf0940
Changes
8
Hide whitespace changes
Inline
Side-by-side
test/dct16x16_test.cc
View file @
600a3860
...
...
@@ -257,17 +257,18 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
}
}
typedef
void
(
*
fdct_t
)(
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
idct_t
)(
const
int16_t
*
in
,
uint8_t
*
dst
,
int
stride
);
typedef
void
(
*
fht_t
)
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
);
typedef
void
(
*
iht_t
)
(
const
int16_t
*
in
,
uint8_t
*
dst
,
int
stride
,
typedef
void
(
*
fdct_t
)(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
idct_t
)(
const
int16_t
*
in
,
uint8_t
*
out
,
int
stride
);
typedef
void
(
*
fht_t
)
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
);
typedef
void
(
*
iht_t
)
(
const
int16_t
*
in
,
uint8_t
*
out
,
int
stride
,
int
tx_type
);
void
fdct16x16_ref
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
void
fdct16x16_ref
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
vp9_fdct16x16_c
(
in
,
out
,
stride
);
}
void
fht16x16_ref
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
void
fht16x16_ref
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
vp9_short_fht16x16_c
(
in
,
out
,
stride
,
tx_type
);
}
...
...
test/dct32x32_test.cc
View file @
600a3860
...
...
@@ -74,8 +74,8 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
}
}
typedef
void
(
*
fwd_txfm_t
)(
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
inv_txfm_t
)(
const
int16_t
*
in
,
uint8_t
*
ds
t
,
int
stride
);
typedef
void
(
*
fwd_txfm_t
)(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
inv_txfm_t
)(
const
int16_t
*
in
,
uint8_t
*
ou
t
,
int
stride
);
class
Trans32x32Test
:
public
PARAMS
(
fwd_txfm_t
,
inv_txfm_t
,
int
)
{
public:
...
...
test/fdct8x8_test.cc
View file @
600a3860
...
...
@@ -28,17 +28,18 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
using
libvpx_test
::
ACMRandom
;
namespace
{
typedef
void
(
*
fdct_t
)(
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
idct_t
)(
const
int16_t
*
in
,
uint8_t
*
dst
,
int
stride
);
typedef
void
(
*
fht_t
)
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
);
typedef
void
(
*
iht_t
)
(
const
int16_t
*
in
,
uint8_t
*
dst
,
int
stride
,
int
tx_type
);
void
fdct8x8_ref
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
typedef
void
(
*
fdct_t
)(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
);
typedef
void
(
*
idct_t
)(
const
int16_t
*
in
,
uint8_t
*
out
,
int
stride
);
typedef
void
(
*
fht_t
)
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
);
typedef
void
(
*
iht_t
)
(
const
int16_t
*
in
,
uint8_t
*
out
,
int
stride
,
int
tx_type
);
void
fdct8x8_ref
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
vp9_fdct8x8_c
(
in
,
out
,
stride
);
}
void
fht8x8_ref
(
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
void
fht8x8_ref
(
const
int16_t
*
in
,
int16_t
*
out
,
int
stride
,
int
tx_type
)
{
vp9_short_fht8x8_c
(
in
,
out
,
stride
,
tx_type
);
}
...
...
vp9/common/vp9_rtcd_defs.sh
View file @
600a3860
...
...
@@ -686,31 +686,31 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
fi
# fdct functions
prototype void vp9_short_fht4x4
"int16_t *
I
nput
Data
, int16_t *
O
utput
Data
, int
pitch
, int tx_type"
prototype void vp9_short_fht4x4
"
const
int16_t *
i
nput, int16_t *
o
utput, int
stride
, int tx_type"
specialize vp9_short_fht4x4 sse2
prototype void vp9_short_fht8x8
"int16_t *
I
nput
Data
, int16_t *
O
utput
Data
, int
pitch
, int tx_type"
prototype void vp9_short_fht8x8
"
const
int16_t *
i
nput, int16_t *
o
utput, int
stride
, int tx_type"
specialize vp9_short_fht8x8 sse2
prototype void vp9_short_fht16x16
"int16_t *
I
nput
Data
, int16_t *
O
utput
Data
, int
pitch
, int tx_type"
prototype void vp9_short_fht16x16
"
const
int16_t *
i
nput, int16_t *
o
utput, int
stride
, int tx_type"
specialize vp9_short_fht16x16 sse2
prototype void vp9_fwht4x4
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fwht4x4
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fwht4x4
prototype void vp9_fdct4x4
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fdct4x4
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fdct4x4 sse2
prototype void vp9_fdct8x8
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fdct8x8
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fdct8x8 sse2
prototype void vp9_fdct16x16
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fdct16x16
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fdct16x16 sse2
prototype void vp9_fdct32x32
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fdct32x32
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fdct32x32 sse2
prototype void vp9_fdct32x32_rd
"int16_t *input, int16_t *output, int stride"
prototype void vp9_fdct32x32_rd
"
const
int16_t *input, int16_t *output, int stride"
specialize vp9_fdct32x32_rd sse2
#
...
...
vp9/encoder/vp9_block.h
View file @
600a3860
...
...
@@ -173,7 +173,7 @@ struct macroblock {
BLOCK_SIZE
sb_partitioning
[
4
];
BLOCK_SIZE
sb64_partitioning
;
void
(
*
fwd_txm4x4
)(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
);
void
(
*
fwd_txm4x4
)(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
);
};
// TODO(jingning): the variables used here are little complicated. need further
...
...
vp9/encoder/vp9_dct.c
View file @
600a3860
...
...
@@ -36,7 +36,7 @@ static void fdct4(const int16_t *input, int16_t *output) {
output
[
3
]
=
dct_const_round_shift
(
temp2
);
}
void
vp9_fdct4x4_c
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fdct4x4_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
...
...
@@ -46,7 +46,7 @@ void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
int
pass
;
// We need an intermediate buffer between passes.
int16_t
intermediate
[
4
*
4
];
int16_t
*
in
=
input
;
const
int16_t
*
in
=
input
;
int16_t
*
out
=
intermediate
;
// Do the two transform/transpose passes
for
(
pass
=
0
;
pass
<
2
;
++
pass
)
{
...
...
@@ -148,8 +148,8 @@ static const transform_2d FHT_4[] = {
{
fadst4
,
fadst4
}
// ADST_ADST = 3
};
void
vp9_short_fht4x4_c
(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
,
TX_TYPE
tx_type
)
{
void
vp9_short_fht4x4_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
TX_TYPE
tx_type
)
{
int16_t
out
[
4
*
4
];
int16_t
*
outptr
=
&
out
[
0
];
int
i
,
j
;
...
...
@@ -159,7 +159,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
// Columns
for
(
i
=
0
;
i
<
4
;
++
i
)
{
for
(
j
=
0
;
j
<
4
;
++
j
)
temp_in
[
j
]
=
input
[
j
*
pitch
+
i
]
*
16
;
temp_in
[
j
]
=
input
[
j
*
stride
+
i
]
*
16
;
if
(
i
==
0
&&
temp_in
[
0
])
temp_in
[
0
]
+=
1
;
ht
.
cols
(
temp_in
,
temp_out
);
...
...
@@ -229,7 +229,7 @@ static void fdct8(const int16_t *input, int16_t *output) {
output
[
7
]
=
dct_const_round_shift
(
t3
);
}
void
vp9_fdct8x8_c
(
int16_t
*
input
,
int16_t
*
final_output
,
int
stride
)
{
void
vp9_fdct8x8_c
(
const
int16_t
*
input
,
int16_t
*
final_output
,
int
stride
)
{
int
i
,
j
;
int16_t
intermediate
[
64
];
...
...
@@ -300,7 +300,7 @@ void vp9_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) {
}
}
void
vp9_fdct16x16_c
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fdct16x16_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
...
...
@@ -310,7 +310,7 @@ void vp9_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
int
pass
;
// We need an intermediate buffer between passes.
int16_t
intermediate
[
256
];
int16_t
*
in
=
input
;
const
int16_t
*
in
=
input
;
int16_t
*
out
=
intermediate
;
// Do the two transform/transpose passes
for
(
pass
=
0
;
pass
<
2
;
++
pass
)
{
...
...
@@ -556,8 +556,8 @@ static const transform_2d FHT_8[] = {
{
fadst8
,
fadst8
}
// ADST_ADST = 3
};
void
vp9_short_fht8x8_c
(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
,
TX_TYPE
tx_type
)
{
void
vp9_short_fht8x8_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
TX_TYPE
tx_type
)
{
int16_t
out
[
64
];
int16_t
*
outptr
=
&
out
[
0
];
int
i
,
j
;
...
...
@@ -567,7 +567,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
// Columns
for
(
i
=
0
;
i
<
8
;
++
i
)
{
for
(
j
=
0
;
j
<
8
;
++
j
)
temp_in
[
j
]
=
input
[
j
*
pitch
+
i
]
*
4
;
temp_in
[
j
]
=
input
[
j
*
stride
+
i
]
*
4
;
ht
.
cols
(
temp_in
,
temp_out
);
for
(
j
=
0
;
j
<
8
;
++
j
)
outptr
[
j
*
8
+
i
]
=
temp_out
[
j
];
...
...
@@ -585,10 +585,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
void
vp9_fwht4x4_c
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fwht4x4_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
int
i
;
int
a1
,
b1
,
c1
,
d1
,
e1
;
int16_t
*
ip
=
input
;
const
int16_t
*
ip
=
input
;
int16_t
*
op
=
output
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
...
...
@@ -949,8 +949,8 @@ static const transform_2d FHT_16[] = {
{
fadst16
,
fadst16
}
// ADST_ADST = 3
};
void
vp9_short_fht16x16_c
(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
,
TX_TYPE
tx_type
)
{
void
vp9_short_fht16x16_c
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
TX_TYPE
tx_type
)
{
int16_t
out
[
256
];
int16_t
*
outptr
=
&
out
[
0
];
int
i
,
j
;
...
...
@@ -960,7 +960,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
// Columns
for
(
i
=
0
;
i
<
16
;
++
i
)
{
for
(
j
=
0
;
j
<
16
;
++
j
)
temp_in
[
j
]
=
input
[
j
*
pitch
+
i
]
*
4
;
temp_in
[
j
]
=
input
[
j
*
stride
+
i
]
*
4
;
ht
.
cols
(
temp_in
,
temp_out
);
for
(
j
=
0
;
j
<
16
;
++
j
)
outptr
[
j
*
16
+
i
]
=
(
temp_out
[
j
]
+
1
+
(
temp_out
[
j
]
<
0
))
>>
2
;
...
...
@@ -1311,7 +1311,7 @@ static void dct32_1d(const int *input, int *output, int round) {
output
[
31
]
=
dct_32_round
(
step
[
31
]
*
cospi_31_64
+
step
[
16
]
*
-
cospi_1_64
);
}
void
vp9_fdct32x32_c
(
int16_t
*
input
,
int16_t
*
out
,
int
stride
)
{
void
vp9_fdct32x32_c
(
const
int16_t
*
input
,
int16_t
*
out
,
int
stride
)
{
int
i
,
j
;
int
output
[
32
*
32
];
...
...
@@ -1339,7 +1339,7 @@ void vp9_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
// Note that although we use dct_32_round in dct32_1d computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
void
vp9_fdct32x32_rd_c
(
int16_t
*
input
,
int16_t
*
out
,
int
stride
)
{
void
vp9_fdct32x32_rd_c
(
const
int16_t
*
input
,
int16_t
*
out
,
int
stride
)
{
int
i
,
j
;
int
output
[
32
*
32
];
...
...
vp9/encoder/x86/vp9_dct32x32_sse2.c
View file @
600a3860
...
...
@@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
}
#endif
void
FDCT32x32_2D
(
int16_t
*
input
,
void
FDCT32x32_2D
(
const
int16_t
*
input
,
int16_t
*
output_org
,
int
stride
)
{
// Calculate pre-multiplied strides
const
int
str1
=
stride
;
...
...
@@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
// intrinsic make the code slightly slower.
if
(
0
==
pass
)
{
int16_t
*
in
=
&
input
[
column_start
];
const
int16_t
*
in
=
&
input
[
column_start
];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
int16_t
*
ina
=
in
+
0
*
str1
;
int16_t
*
inb
=
in
+
31
*
str1
;
const
int16_t
*
ina
=
in
+
0
*
str1
;
const
int16_t
*
inb
=
in
+
31
*
str1
;
__m128i
*
step1a
=
&
step1
[
0
];
__m128i
*
step1b
=
&
step1
[
31
];
const
__m128i
ina0
=
_mm_loadu_si128
((
const
__m128i
*
)(
ina
));
...
...
@@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
step1b
[
-
0
]
=
_mm_slli_epi16
(
step1b
[
-
0
],
2
);
}
{
int16_t
*
ina
=
in
+
4
*
str1
;
int16_t
*
inb
=
in
+
27
*
str1
;
const
int16_t
*
ina
=
in
+
4
*
str1
;
const
int16_t
*
inb
=
in
+
27
*
str1
;
__m128i
*
step1a
=
&
step1
[
4
];
__m128i
*
step1b
=
&
step1
[
27
];
const
__m128i
ina0
=
_mm_loadu_si128
((
const
__m128i
*
)(
ina
));
...
...
@@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
step1b
[
-
0
]
=
_mm_slli_epi16
(
step1b
[
-
0
],
2
);
}
{
int16_t
*
ina
=
in
+
8
*
str1
;
int16_t
*
inb
=
in
+
23
*
str1
;
const
int16_t
*
ina
=
in
+
8
*
str1
;
const
int16_t
*
inb
=
in
+
23
*
str1
;
__m128i
*
step1a
=
&
step1
[
8
];
__m128i
*
step1b
=
&
step1
[
23
];
const
__m128i
ina0
=
_mm_loadu_si128
((
const
__m128i
*
)(
ina
));
...
...
@@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
step1b
[
-
0
]
=
_mm_slli_epi16
(
step1b
[
-
0
],
2
);
}
{
int16_t
*
ina
=
in
+
12
*
str1
;
int16_t
*
inb
=
in
+
19
*
str1
;
const
int16_t
*
ina
=
in
+
12
*
str1
;
const
int16_t
*
inb
=
in
+
19
*
str1
;
__m128i
*
step1a
=
&
step1
[
12
];
__m128i
*
step1b
=
&
step1
[
19
];
const
__m128i
ina0
=
_mm_loadu_si128
((
const
__m128i
*
)(
ina
));
...
...
vp9/encoder/x86/vp9_dct_sse2.c
View file @
600a3860
...
...
@@ -12,7 +12,7 @@
#include
"vp9/common/vp9_idct.h"
// for cospi constants
#include
"vpx_ports/mem.h"
void
vp9_fdct4x4_sse2
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fdct4x4_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
...
...
@@ -111,7 +111,8 @@ void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
}
}
static
INLINE
void
load_buffer_4x4
(
int16_t
*
input
,
__m128i
*
in
,
int
stride
)
{
static
INLINE
void
load_buffer_4x4
(
const
int16_t
*
input
,
__m128i
*
in
,
int
stride
)
{
const
__m128i
k__nonzero_bias_a
=
_mm_setr_epi16
(
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
);
const
__m128i
k__nonzero_bias_b
=
_mm_setr_epi16
(
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
);
__m128i
mask
;
...
...
@@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
transpose_4x4
(
in
);
}
void
vp9_short_fht4x4_sse2
(
int16_t
*
input
,
int16_t
*
output
,
void
vp9_short_fht4x4_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
4
];
load_buffer_4x4
(
input
,
in
,
stride
);
...
...
@@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
write_buffer_4x4
(
output
,
in
);
}
void
vp9_fdct8x8_sse2
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fdct8x8_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
int
pass
;
// Constants
// When we use them, in one case, they are all the same. In all others
...
...
@@ -527,15 +528,16 @@ void vp9_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
}
// load 8x8 array
static
INLINE
void
load_buffer_8x8
(
int16_t
*
input
,
__m128i
*
in
,
int
stride
)
{
in
[
0
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
0
*
stride
));
in
[
1
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
1
*
stride
));
in
[
2
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
2
*
stride
));
in
[
3
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
3
*
stride
));
in
[
4
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
4
*
stride
));
in
[
5
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
5
*
stride
));
in
[
6
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
6
*
stride
));
in
[
7
]
=
_mm_load_si128
((
__m128i
*
)(
input
+
7
*
stride
));
static
INLINE
void
load_buffer_8x8
(
const
int16_t
*
input
,
__m128i
*
in
,
int
stride
)
{
in
[
0
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
0
*
stride
));
in
[
1
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
1
*
stride
));
in
[
2
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
2
*
stride
));
in
[
3
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
3
*
stride
));
in
[
4
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
4
*
stride
));
in
[
5
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
5
*
stride
));
in
[
6
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
6
*
stride
));
in
[
7
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
7
*
stride
));
in
[
0
]
=
_mm_slli_epi16
(
in
[
0
],
2
);
in
[
1
]
=
_mm_slli_epi16
(
in
[
1
],
2
);
...
...
@@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
array_transpose_8x8
(
in
,
in
);
}
void
vp9_short_fht8x8_sse2
(
int16_t
*
input
,
int16_t
*
output
,
void
vp9_short_fht8x8_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
8
];
load_buffer_8x8
(
input
,
in
,
stride
);
...
...
@@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
write_buffer_8x8
(
output
,
in
,
8
);
}
void
vp9_fdct16x16_sse2
(
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
void
vp9_fdct16x16_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
...
...
@@ -1064,7 +1066,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
int
pass
;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY
(
16
,
int16_t
,
intermediate
,
256
);
int16_t
*
in
=
input
;
const
int16_t
*
in
=
input
;
int16_t
*
out
=
intermediate
;
// Constants
// When we use them, in one case, they are all the same. In all others
...
...
@@ -1679,7 +1681,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
}
}
static
INLINE
void
load_buffer_16x16
(
int16_t
*
input
,
__m128i
*
in0
,
static
INLINE
void
load_buffer_16x16
(
const
int16_t
*
input
,
__m128i
*
in0
,
__m128i
*
in1
,
int
stride
)
{
// load first 8 columns
load_buffer_8x8
(
input
,
in0
,
stride
);
...
...
@@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16
(
in0
,
in1
);
}
void
vp9_short_fht16x16_sse2
(
int16_t
*
input
,
int16_t
*
output
,
void
vp9_short_fht16x16_sse2
(
const
int16_t
*
input
,
int16_t
*
output
,
int
stride
,
int
tx_type
)
{
__m128i
in0
[
16
],
in1
[
16
];
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment