Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
dac5891a
Commit
dac5891a
authored
Jul 11, 2013
by
Jingning Han
Committed by
Gerrit Code Review
Jul 11, 2013
Browse files
Merge "SSE2 4x4 invserse ADST/DCT transform"
parents
b55ecafd
49b63020
Changes
2
Hide whitespace changes
Inline
Side-by-side
vp9/common/vp9_rtcd_defs.sh
View file @
dac5891a
...
...
@@ -325,7 +325,7 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de
specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht4x4_add
"int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht4x4_add
specialize vp9_short_iht4x4_add
sse2
prototype void vp9_short_iht8x8_add
"int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht8x8_add
...
...
vp9/common/x86/vp9_idct_intrin_sse2.c
View file @
dac5891a
...
...
@@ -241,6 +241,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
_mm_storel_epi64
((
__m128i
*
)
output
,
in
);
}
static
INLINE
void
transpose_4x4
(
__m128i
*
res
)
{
const
__m128i
tr0_0
=
_mm_unpacklo_epi16
(
res
[
0
],
res
[
1
]);
const
__m128i
tr0_1
=
_mm_unpacklo_epi16
(
res
[
2
],
res
[
3
]);
res
[
0
]
=
_mm_unpacklo_epi32
(
tr0_0
,
tr0_1
);
res
[
2
]
=
_mm_unpackhi_epi32
(
tr0_0
,
tr0_1
);
res
[
1
]
=
_mm_unpackhi_epi64
(
res
[
0
],
res
[
0
]);
res
[
3
]
=
_mm_unpackhi_epi64
(
res
[
2
],
res
[
2
]);
}
void
idct4_1d_sse2
(
__m128i
*
in
)
{
const
__m128i
k__cospi_p16_p16
=
pair_set_epi16
(
cospi_16_64
,
cospi_16_64
);
const
__m128i
k__cospi_p16_m16
=
pair_set_epi16
(
cospi_16_64
,
-
cospi_16_64
);
const
__m128i
k__cospi_p24_m08
=
pair_set_epi16
(
cospi_24_64
,
-
cospi_8_64
);
const
__m128i
k__cospi_p08_p24
=
pair_set_epi16
(
cospi_8_64
,
cospi_24_64
);
const
__m128i
k__DCT_CONST_ROUNDING
=
_mm_set1_epi32
(
DCT_CONST_ROUNDING
);
__m128i
u
[
8
],
v
[
8
];
transpose_4x4
(
in
);
// stage 1
u
[
0
]
=
_mm_unpacklo_epi16
(
in
[
0
],
in
[
2
]);
u
[
1
]
=
_mm_unpacklo_epi16
(
in
[
1
],
in
[
3
]);
v
[
0
]
=
_mm_madd_epi16
(
u
[
0
],
k__cospi_p16_p16
);
v
[
1
]
=
_mm_madd_epi16
(
u
[
0
],
k__cospi_p16_m16
);
v
[
2
]
=
_mm_madd_epi16
(
u
[
1
],
k__cospi_p24_m08
);
v
[
3
]
=
_mm_madd_epi16
(
u
[
1
],
k__cospi_p08_p24
);
u
[
0
]
=
_mm_add_epi32
(
v
[
0
],
k__DCT_CONST_ROUNDING
);
u
[
1
]
=
_mm_add_epi32
(
v
[
1
],
k__DCT_CONST_ROUNDING
);
u
[
2
]
=
_mm_add_epi32
(
v
[
2
],
k__DCT_CONST_ROUNDING
);
u
[
3
]
=
_mm_add_epi32
(
v
[
3
],
k__DCT_CONST_ROUNDING
);
v
[
0
]
=
_mm_srai_epi32
(
u
[
0
],
DCT_CONST_BITS
);
v
[
1
]
=
_mm_srai_epi32
(
u
[
1
],
DCT_CONST_BITS
);
v
[
2
]
=
_mm_srai_epi32
(
u
[
2
],
DCT_CONST_BITS
);
v
[
3
]
=
_mm_srai_epi32
(
u
[
3
],
DCT_CONST_BITS
);
u
[
0
]
=
_mm_packs_epi32
(
v
[
0
],
v
[
2
]);
u
[
1
]
=
_mm_packs_epi32
(
v
[
1
],
v
[
3
]);
u
[
2
]
=
_mm_unpackhi_epi64
(
u
[
0
],
u
[
0
]);
u
[
3
]
=
_mm_unpackhi_epi64
(
u
[
1
],
u
[
1
]);
// stage 2
in
[
0
]
=
_mm_add_epi16
(
u
[
0
],
u
[
3
]);
in
[
1
]
=
_mm_add_epi16
(
u
[
1
],
u
[
2
]);
in
[
2
]
=
_mm_sub_epi16
(
u
[
1
],
u
[
2
]);
in
[
3
]
=
_mm_sub_epi16
(
u
[
0
],
u
[
3
]);
}
void
iadst4_1d_sse2
(
__m128i
*
in
)
{
const
__m128i
k__sinpi_p01_p04
=
pair_set_epi16
(
sinpi_1_9
,
sinpi_4_9
);
const
__m128i
k__sinpi_p03_p02
=
pair_set_epi16
(
sinpi_3_9
,
sinpi_2_9
);
const
__m128i
k__sinpi_p02_m01
=
pair_set_epi16
(
sinpi_2_9
,
-
sinpi_1_9
);
const
__m128i
k__sinpi_p03_m04
=
pair_set_epi16
(
sinpi_3_9
,
-
sinpi_4_9
);
const
__m128i
k__sinpi_p03_p03
=
_mm_set1_epi16
(
sinpi_3_9
);
const
__m128i
kZero
=
_mm_set1_epi16
(
0
);
const
__m128i
k__DCT_CONST_ROUNDING
=
_mm_set1_epi32
(
DCT_CONST_ROUNDING
);
__m128i
u
[
8
],
v
[
8
],
in7
;
transpose_4x4
(
in
);
in7
=
_mm_add_epi16
(
in
[
0
],
in
[
3
]);
in7
=
_mm_sub_epi16
(
in7
,
in
[
2
]);
u
[
0
]
=
_mm_unpacklo_epi16
(
in
[
0
],
in
[
2
]);
u
[
1
]
=
_mm_unpacklo_epi16
(
in
[
1
],
in
[
3
]);
u
[
2
]
=
_mm_unpacklo_epi16
(
in7
,
kZero
);
u
[
3
]
=
_mm_unpacklo_epi16
(
in
[
1
],
kZero
);
v
[
0
]
=
_mm_madd_epi16
(
u
[
0
],
k__sinpi_p01_p04
);
// s0 + s3
v
[
1
]
=
_mm_madd_epi16
(
u
[
1
],
k__sinpi_p03_p02
);
// s2 + s5
v
[
2
]
=
_mm_madd_epi16
(
u
[
2
],
k__sinpi_p03_p03
);
// x2
v
[
3
]
=
_mm_madd_epi16
(
u
[
0
],
k__sinpi_p02_m01
);
// s1 - s4
v
[
4
]
=
_mm_madd_epi16
(
u
[
1
],
k__sinpi_p03_m04
);
// s2 - s6
v
[
5
]
=
_mm_madd_epi16
(
u
[
3
],
k__sinpi_p03_p03
);
// s2
u
[
0
]
=
_mm_add_epi32
(
v
[
0
],
v
[
1
]);
u
[
1
]
=
_mm_add_epi32
(
v
[
3
],
v
[
4
]);
u
[
2
]
=
v
[
2
];
u
[
3
]
=
_mm_add_epi32
(
u
[
0
],
u
[
1
]);
u
[
4
]
=
_mm_slli_epi32
(
v
[
5
],
2
);
u
[
5
]
=
_mm_add_epi32
(
u
[
3
],
v
[
5
]);
u
[
6
]
=
_mm_sub_epi32
(
u
[
5
],
u
[
4
]);
v
[
0
]
=
_mm_add_epi32
(
u
[
0
],
k__DCT_CONST_ROUNDING
);
v
[
1
]
=
_mm_add_epi32
(
u
[
1
],
k__DCT_CONST_ROUNDING
);
v
[
2
]
=
_mm_add_epi32
(
u
[
2
],
k__DCT_CONST_ROUNDING
);
v
[
3
]
=
_mm_add_epi32
(
u
[
6
],
k__DCT_CONST_ROUNDING
);
u
[
0
]
=
_mm_srai_epi32
(
v
[
0
],
DCT_CONST_BITS
);
u
[
1
]
=
_mm_srai_epi32
(
v
[
1
],
DCT_CONST_BITS
);
u
[
2
]
=
_mm_srai_epi32
(
v
[
2
],
DCT_CONST_BITS
);
u
[
3
]
=
_mm_srai_epi32
(
v
[
3
],
DCT_CONST_BITS
);
in
[
0
]
=
_mm_packs_epi32
(
u
[
0
],
u
[
2
]);
in
[
1
]
=
_mm_packs_epi32
(
u
[
1
],
u
[
3
]);
in
[
2
]
=
_mm_unpackhi_epi64
(
in
[
0
],
in
[
0
]);
in
[
3
]
=
_mm_unpackhi_epi64
(
in
[
1
],
in
[
1
]);
}
void
vp9_short_iht4x4_add_sse2
(
int16_t
*
input
,
uint8_t
*
dest
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
4
];
const
__m128i
zero
=
_mm_setzero_si128
();
const
__m128i
eight
=
_mm_set1_epi16
(
8
);
in
[
0
]
=
_mm_loadl_epi64
((
__m128i
*
)
input
);
in
[
1
]
=
_mm_loadl_epi64
((
__m128i
*
)(
input
+
4
));
in
[
2
]
=
_mm_loadl_epi64
((
__m128i
*
)(
input
+
8
));
in
[
3
]
=
_mm_loadl_epi64
((
__m128i
*
)(
input
+
12
));
switch
(
tx_type
)
{
case
0
:
// DCT_DCT
idct4_1d_sse2
(
in
);
idct4_1d_sse2
(
in
);
break
;
case
1
:
// ADST_DCT
idct4_1d_sse2
(
in
);
iadst4_1d_sse2
(
in
);
break
;
case
2
:
// DCT_ADST
iadst4_1d_sse2
(
in
);
idct4_1d_sse2
(
in
);
break
;
case
3
:
// ADST_ADST
iadst4_1d_sse2
(
in
);
iadst4_1d_sse2
(
in
);
break
;
default:
assert
(
0
);
break
;
}
// Final round and shift
in
[
0
]
=
_mm_add_epi16
(
in
[
0
],
eight
);
in
[
1
]
=
_mm_add_epi16
(
in
[
1
],
eight
);
in
[
2
]
=
_mm_add_epi16
(
in
[
2
],
eight
);
in
[
3
]
=
_mm_add_epi16
(
in
[
3
],
eight
);
in
[
0
]
=
_mm_srai_epi16
(
in
[
0
],
4
);
in
[
1
]
=
_mm_srai_epi16
(
in
[
1
],
4
);
in
[
2
]
=
_mm_srai_epi16
(
in
[
2
],
4
);
in
[
3
]
=
_mm_srai_epi16
(
in
[
3
],
4
);
RECON_AND_STORE4X4
(
dest
,
in
[
0
]);
RECON_AND_STORE4X4
(
dest
,
in
[
1
]);
RECON_AND_STORE4X4
(
dest
,
in
[
2
]);
RECON_AND_STORE4X4
(
dest
,
in
[
3
]);
}
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment