idct16x16_neon.c 7.54 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include "vpx_dsp/vpx_dsp_common.h"
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Two-pass 1-D 16-point IDCT kernels used by the drivers in this file.
 * They are only declared here; the definitions live outside this file
 * (presumably NEON assembly/intrinsics -- confirm against the build).
 *
 * As used by the callers below:
 *  - pass1 processes the even elements (0, 2, ..., 14) of |input| and saves
 *    the stage 6 result in |output|; callers pass 8 as |output_stride|.
 *  - pass2 processes the odd elements (1, 3, ..., 15) of |src|, combines them
 *    with |pass1Output| and computes the final stage 7 result.  With
 *    |skip_adding| == 0 the result is saved into |output|; with
 *    |skip_adding| == 1 the result is added to |dest|.
 */

// Full (256-coefficient) variant, first pass.
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
                                      int16_t *output,
                                      int output_stride);
// Full (256-coefficient) variant, second pass.
void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
                                      int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding,
                                      uint8_t *dest,
                                      int dest_stride);
// Reduced ("10") variant, first pass; same parameter list as the 256 variant.
void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
                                     int16_t *output,
                                     int output_stride);
// Reduced ("10") variant, second pass; same parameter list as the 256 variant.
void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
                                     int16_t *output,
                                     int16_t *pass1Output,
                                     int16_t skip_adding,
                                     uint8_t *dest,
                                     int dest_stride);
31

32
#if HAVE_NEON_ASM
33
34
35
/*
 * For ARM NEON, d8-d15 are callee-saved registers, and need to be saved.
 * vp9_push_neon() is called before the transform passes and vp9_pop_neon()
 * after them; the callers in this file hand both the same int64_t[8] buffer
 * as |store|.  The helpers are defined outside this file.
 */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
36
#endif  // HAVE_NEON_ASM
37

38
39
/*
 * 16x16 inverse DCT using all 256 coefficients; the reconstructed residual
 * is added to the destination block.
 *
 *   input       - 16x16 block of int16_t coefficients (the lower 8 rows are
 *                 read starting at input + 8*16).
 *   dest        - destination pixels; the left and right 8 columns are
 *                 updated through dest and dest + 8 respectively.
 *   dest_stride - stride of dest, forwarded unchanged to the pass2 kernel.
 *
 * The transform runs as a row pass over two groups of 8 rows, followed by a
 * column pass over two groups of 8 columns.  Each group is handled by a
 * pass1 (even elements, up to stage 6) / pass2 (odd elements, stage 7) pair.
 * NOTE(review): the row pass stores the lower rows at row_idct_output + 8 and
 * the column pass reads the right columns from row_idct_output + 8*16, which
 * suggests pass2 writes its output transposed -- confirm against the kernel.
 */
void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

#if HAVE_NEON_ASM
  // save d8-d15 register values.
  vp9_push_neon(store_reg);
#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_idct16x16_256_add_neon_pass2(input+1,
                                   row_idct_output,
                                   pass1_output,
                                   0,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the lower 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   0,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                   row_idct_output,
                                   pass1_output,
                                   1,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   1,
                                   dest+8,
                                   dest_stride);

#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
#endif
}

119
120
/*
 * 16x16 inverse DCT for sparse blocks (the "10" variant); the reconstructed
 * residual is added to the destination block.
 *
 *   input       - 16x16 block of int16_t coefficients; only the upper 8 rows
 *                 are transformed in the row pass, the lower 8 rows are
 *                 skipped because they are all zero.
 *   dest        - destination pixels; the left and right 8 columns are
 *                 updated through dest and dest + 8 respectively.
 *   dest_stride - stride of dest, forwarded unchanged to the pass2 kernel.
 *
 * The row pass uses the reduced _10_ kernels on the upper 8 rows only.  The
 * column pass then runs the full _256_ kernels on both column groups, so
 * row_idct_output must stay zero-initialised for the part the skipped row
 * pass never writes.
 */
void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

#if HAVE_NEON_ASM
  // save d8-d15 register values.
  vp9_push_neon(store_reg);
#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_idct16x16_10_add_neon_pass2(input+1,
                                  row_idct_output,
                                  pass1_output,
                                  0,
                                  dest,
                                  dest_stride);

  /* Skip Parallel idct on the lower 8 rows as they are all 0s */

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                   row_idct_output,
                                   pass1_output,
                                   1,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   1,
                                   dest+8,
                                   dest_stride);

#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
#endif
}