/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_dsp_common.h"

/*
 * Pass helpers used by the 16x16 inverse DCT entry points in this file.
 * Only the declarations appear here; the definitions are provided elsewhere
 * (presumably NEON assembly or intrinsics -- confirm against the build).
 *
 * As used by the callers in this file:
 *   - pass1 handles the even-indexed elements and writes its intermediate
 *     (stage 6) result to `output`, using `output_stride`.
 *   - pass2 handles the odd-indexed elements and combines them with
 *     `pass1Output`. When `skip_adding` is 0 the result is stored to
 *     `output`; when it is 1 the result is added to `dest` instead.
 *
 * The `_10_` variants are the row-pass helpers of aom_idct16x16_10_add_neon().
 */
void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                      int16_t *pass1Output, int16_t skip_adding,
                                      uint8_t *dest, int dest_stride);
void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                     int output_stride);
void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
                                     int16_t *pass1Output, int16_t skip_adding,
                                     uint8_t *dest, int dest_stride);

#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void aom_push_neon(int64_t *store);
extern void aom_pop_neon(int64_t *store);
#endif  // HAVE_NEON_ASM

/*
 * 16x16 inverse DCT over all 256 coefficients, with the result added to the
 * destination block.
 *
 * The transform runs as a row stage followed by a column stage. Each stage is
 * done in two halves of 8, and each half uses a pass1/pass2 helper pair.
 *
 * input        16x16 coefficient block; the lower 8 rows are read starting
 *              at input + 8 * 16.
 * dest         destination pixels the result is added to; the right 8
 *              columns are addressed as dest + 8.
 * dest_stride  stride of dest, forwarded unchanged to the pass2 helper.
 *
 * NOTE: the pass2 helper's `skip_adding` argument reads inverted relative to
 * its name. Here 0 is passed when the result is stored to row_idct_output and
 * 1 when it is added to dest.
 */
void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t pass1_output[16 * 16] = { 0 };
  int16_t row_idct_output[16 * 16] = { 0 };

#if HAVE_NEON_ASM
  // save d8-d15 register values.
  aom_push_neon(store_reg);
#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
                                   dest, dest_stride);

  /* Parallel idct on the lower 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
                                   pass1_output, 0, dest, dest_stride);

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
                                   pass1_output, 1, dest, dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
                                   row_idct_output + 8, pass1_output, 1,
                                   dest + 8, dest_stride);

#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  aom_pop_neon(store_reg);
#endif
}

/*
 * Reduced 16x16 inverse DCT, with the result added to the destination block.
 *
 * Only the upper 8 rows go through the row stage, using the `_10_` pass
 * helpers. The lower 8 rows are skipped because they are all zero, so the
 * corresponding part of row_idct_output keeps its zero initialisation. The
 * column stage then runs over both 8-column halves using the `_256_` helpers.
 *
 * input        16x16 coefficient block; only the upper 8 rows are read here.
 * dest         destination pixels the result is added to; the right 8
 *              columns are addressed as dest + 8.
 * dest_stride  stride of dest, forwarded unchanged to the pass2 helper.
 *
 * NOTE: the pass2 helper's `skip_adding` argument reads inverted relative to
 * its name. Here 0 is passed when the result is stored to row_idct_output and
 * 1 when it is added to dest.
 */
void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t pass1_output[16 * 16] = { 0 };
  int16_t row_idct_output[16 * 16] = { 0 };

#if HAVE_NEON_ASM
  // save d8-d15 register values.
  aom_push_neon(store_reg);
#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
                                  dest, dest_stride);

  /* Skip Parallel idct on the lower 8 rows as they are all 0s */

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
                                   pass1_output, 1, dest, dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
                                   row_idct_output + 8, pass1_output, 1,
                                   dest + 8, dest_stride);

#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  aom_pop_neon(store_reg);
#endif
}