iht4x4_add_neon.c 7.46 KB
Newer Older
Jingning Han's avatar
Jingning Han committed
1
/*
Yaowu Xu's avatar
Yaowu Xu committed
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
Jingning Han's avatar
Jingning Han committed
3
 *
Yaowu Xu's avatar
Yaowu Xu committed
4 5 6 7 8 9
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Jingning Han's avatar
Jingning Han committed
10 11 12 13 14
 */

#include <arm_neon.h>
#include <assert.h>

Yaowu Xu's avatar
Yaowu Xu committed
15
#include "./aom_config.h"
16 17
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
18
#include "av1/common/common.h"
Jingning Han's avatar
Jingning Han committed
19

clang-format's avatar
clang-format committed
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
  int32x4_t q8s32, q9s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;

  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));

  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
  q0x2s32 = vtrnq_s32(q8s32, q9s32);

  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
  return;
Jingning Han's avatar
Jingning Han committed
35 36
}

clang-format's avatar
clang-format committed
37 38
static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
                                             int16x4_t *d2s16) {
39 40 41
  *d0s16 = vdup_n_s16((int16_t)cospi_8_64);
  *d1s16 = vdup_n_s16((int16_t)cospi_16_64);
  *d2s16 = vdup_n_s16((int16_t)cospi_24_64);
clang-format's avatar
clang-format committed
42
  return;
Jingning Han's avatar
Jingning Han committed
43 44
}

clang-format's avatar
clang-format committed
45 46
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
                                           int16x4_t *d5s16, int16x8_t *q3s16) {
47 48 49 50
  *d3s16 = vdup_n_s16((int16_t)sinpi_1_9);
  *d4s16 = vdup_n_s16((int16_t)sinpi_2_9);
  *q3s16 = vdupq_n_s16((int16_t)sinpi_3_9);
  *d5s16 = vdup_n_s16((int16_t)sinpi_4_9);
clang-format's avatar
clang-format committed
51
  return;
Jingning Han's avatar
Jingning Han committed
52 53
}

clang-format's avatar
clang-format committed
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
                              int16x4_t *d2s16, int16x8_t *q8s16,
                              int16x8_t *q9s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int32x4_t q10s32, q13s32, q14s32, q15s32;
  int16x8_t q13s16, q14s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, *d2s16);
  q10s32 = vmull_s16(d17s16, *d0s16);
  q13s32 = vmull_s16(d23s16, *d1s16);
  q14s32 = vmull_s16(d24s16, *d1s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q10s32, 14);

  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  *q8s16 = vaddq_s16(q13s16, q14s16);
  *q9s16 = vsubq_s16(q13s16, q14s16);
  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
  return;
Jingning Han's avatar
Jingning Han committed
88 89
}

clang-format's avatar
clang-format committed
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
117
  q8s32 = vdupq_n_s32((int32_t)sinpi_3_9);
clang-format's avatar
clang-format committed
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
Jingning Han's avatar
Jingning Han committed
134 135
}

Yaowu Xu's avatar
Yaowu Xu committed
136
void av1_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
137
                            int dest_stride, const TxfmParam *txfm_param) {
clang-format's avatar
clang-format committed
138 139 140 141 142 143 144 145 146 147 148 149 150
  uint8x8_t d26u8, d27u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
  uint32x2_t d26u32, d27u32;
  int16x8_t q3s16, q8s16, q9s16;
  uint16x8_t q8u16, q9u16;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  TRANSPOSE4X4(&q8s16, &q9s16);

151
  int tx_type = txfm_param->tx_type;
clang-format's avatar
clang-format committed
152 153
  switch (tx_type) {
    case 0:  // idct_idct is not supported. Fall back to C
154
      av1_iht4x4_16_add_c(input, dest, dest_stride, txfm_param);
clang-format's avatar
clang-format committed
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
      return;
      break;
    case 1:  // iadst_idct
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    case 2:  // idct_iadst
      // generate constantsyy
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      break;
    case 3:  // iadst_iadst
      // generate constants
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    default:  // iadst_idct
      assert(0);
      break;
  }

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
  dest += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
  return;
Jingning Han's avatar
Jingning Han committed
228
}