NE10_fft_generic_int32.neonintrinsic.h
/*
 * Copyright 2015-16 ARM Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* license of Kiss FFT */
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * NE10 Library : dsp/NE10_fft_generic_int32.neonintrinsic.h
 *
 * This file must be compiled by a C++ toolchain because some functions are
 * written as templates, which makes it easier for the compiler to
 * eliminate branches.
 */
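
/*
 * Note on the templates used below: flags such as is_first_stage, is_inverse
 * and is_scaled are compile-time constants, so every "if (is_inverse)" in a
 * template body is resolved when the template is instantiated, and the dead
 * branch is discarded. A minimal sketch of the same technique (illustrative
 * only, not part of this file):
 *
 *   template<bool negate>
 *   inline int32_t apply (int32_t x)
 *   {
 *       if (negate) // constant at instantiation time; no runtime branch
 *           return -x;
 *       return x;
 *   }
 *
 * apply<true> and apply<false> each compile to straight-line code.
 */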

#ifndef NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
#define NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
#include "NE10_fft_generic_int32.h"

typedef int32x4x2_t CPLX;
typedef int32x4_t REAL;
#define NE10_REAL_DUP_NEON_S32 vdupq_n_s32
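
/*
 * A CPLX holds four complex int32 values in de-interleaved (planar) form:
 * val[0] carries the four real parts and val[1] the four imaginary parts.
 * vld2q_s32/vst2q_s32 de-interleave on load and re-interleave on store, so
 * one CPLX maps to four consecutive ne10_fft_cpx_int32_t elements in memory.
 */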

#ifndef NE10_INLINE_ASM_OPT
#define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
    } while (0)
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
template<class T>
inline static int32x4x2_t NE10_CPLX_LOAD (T *ptr)
{
    int32x4x2_t result;
    asm volatile (
        "ld2 {v0.4s, v1.4s}, [%[pin]] \n\t"
        "mov %[r].16b, v0.16b \n\t"
        "mov %[i].16b, v1.16b \n\t"
        : [r]"+w"(result.val[0]),
          [i]"+w"(result.val[1])
        : [pin]"r"(ptr)
        : "memory", "v0", "v1");
    return result;
}

template<class T>
inline static void NE10_CPLX_STORE (T *ptr, int32x4x2_t out)
{
    asm volatile (
        "mov v0.16b, %[r].16b \n\t"
        "mov v1.16b, %[i].16b \n\t"
        "st2 {v0.4s, v1.4s}, [%[pout]] \n\t"
        : [r]"+w"(out.val[0]),
          [i]"+w"(out.val[1])
        : [pout]"r"(ptr)
        : "memory", "v0", "v1");
}

#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
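
/*
 * Both paths above perform the same de-interleaving load/store: the AArch64
 * ld2/st2 instructions in the inline-assembly variant are the instructions
 * the vld2q_s32/vst2q_s32 intrinsics map to. The assembly variant presumably
 * exists to pin code generation to exactly these instructions.
 */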

template<>
inline CPLX NE10_CPX_LOAD_S<CPLX> (const CPLX *ptr)
{
    return NE10_CPLX_LOAD (ptr);
}

template<>
inline void NE10_CPX_STORE_S<CPLX> (CPLX *ptr, const CPLX out)
{
    NE10_CPLX_STORE (ptr, out);
}

template<>
inline void NE10_LOAD_BY_STEP<1, CPLX> (CPLX out[1],
                                        const CPLX *Fin,
                                        const ne10_int32_t)
{
    out[0] = NE10_CPX_LOAD_S (Fin);
}

template<>
inline void NE10_STORE_BY_STEP<1, CPLX> (CPLX *Fout,
                                         const CPLX out[1],
                                         const ne10_int32_t)
{
    NE10_CPX_STORE_S (Fout, out[0]);
}

static inline REAL NE10_S_MUL_NEON_S32 (const REAL vec,
                                        const ne10_int32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_S32 (scalar);
    REAL result = vqrdmulhq_s32 (scalar_neon, vec);
    return result;
}

static inline void NE10_CPX_MUL_NEON_S32 (CPLX &result, const CPLX A, const CPLX B)
{
    REAL ARBR = vqrdmulhq_s32 (A.val[0], B.val[0]);
    REAL ARBI = vqrdmulhq_s32 (A.val[0], B.val[1]);
    REAL AIBR = vqrdmulhq_s32 (A.val[1], B.val[0]);
    REAL AIBI = vqrdmulhq_s32 (A.val[1], B.val[1]);
    result.val[0] = ARBR - AIBI;
    result.val[1] = ARBI + AIBR;
}
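
/*
 * vqrdmulhq_s32 computes, per lane, the saturating rounding doubling
 * multiply-high: sat ((2 * a * b + 0x80000000) >> 32). For Q31 fixed-point
 * operands that is exactly a rounded Q31 x Q31 -> Q31 product; e.g. with
 * a = 0x40000000 (0.5) and b = 0x20000000 (0.25) the result is
 * 0x10000000 (0.125). NE10_CPX_MUL_NEON_S32 then applies the usual
 * complex-product identity
 *   (Ar + j*Ai)(Br + j*Bi) = (Ar*Br - Ai*Bi) + j*(Ar*Bi + Ai*Br).
 */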

template<int RADIX>
inline void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX],
                                  const ne10_fft_cpx_int32_t *ptr_in,
                                  const ne10_int32_t step)
{
    CPLX scratch_tw;
    int32x2_t d2_tmp = vld1_s32 ((ne10_int32_t *) (ptr_in + (RADIX - 2) * step));

    scratch_tw.val[0] = NE10_REAL_DUP_NEON_S32 (d2_tmp[0]);
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_S32 (d2_tmp[1]);
    NE10_CPX_MUL_NEON_S32 (scratch_in[RADIX - 1], scratch_in[RADIX - 1], scratch_tw);

    NE10_LOAD_TW_AND_MUL<RADIX - 1> (scratch_in, ptr_in, step);
}

template<>
inline void NE10_LOAD_TW_AND_MUL<1> (CPLX [1],
                                     const ne10_fft_cpx_int32_t *,
                                     const ne10_int32_t)
{
}
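
/*
 * The recursion above unrolls at compile time: NE10_LOAD_TW_AND_MUL<4>
 * multiplies element 3 by its twiddle, then instantiates
 * NE10_LOAD_TW_AND_MUL<3>, and so on down to the empty <1> specialization.
 * Element 0 is deliberately left untouched, since its twiddle factor is 1.
 */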

// Conjugate in place.
template<>
inline void NE10_CONJ_S<CPLX> (CPLX &cplx)
{
    cplx.val[1] = -cplx.val[1];
}

template<>
inline void NE10_CONJ<1, CPLX> (CPLX in[1])
{
    NE10_CONJ_S<CPLX> (in[0]);
}

// Scaling.
// If the macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing.
template<int RADIX, int SIZE = RADIX>
struct NE10_FFT_SCALING {
    inline void operator() (CPLX scratch_out[RADIX])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[SIZE - 1].val[0] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[0], one_by_RADIX);
        scratch_out[SIZE - 1].val[1] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[1], one_by_RADIX);
        NE10_FFT_SCALING<RADIX, SIZE - 1> () (scratch_out);
#endif
    }
};

template<int RADIX>
struct NE10_FFT_SCALING<RADIX, 1> {
    inline void operator() (CPLX scratch_out[1])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[0].val[0] = vqrdmulhq_s32 (scratch_out[0].val[0], one_by_RADIX);
        scratch_out[0].val[1] = vqrdmulhq_s32 (scratch_out[0].val[1], one_by_RADIX);
#endif
    }
};
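
/*
 * NE10_FFT_SCALING is a functor struct rather than a function because it is
 * partially specialized on <RADIX, 1> to terminate the compile-time
 * recursion, and C++ does not allow partial specialization of function
 * templates. one_by_RADIX is 1/RADIX converted to fixed point via
 * NE10_F2I32_MAX, so each stage divides its outputs by its radix; since the
 * stage radices multiply to nfft, the full transform is scaled by 1/nfft.
 */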

inline void NE10_CPX_ADD_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vaddq_s32 (a.val[0], b.val[0]);
    result.val[1] = vaddq_s32 (a.val[1], b.val[1]);
}

inline void NE10_CPX_SUB_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vsubq_s32 (a.val[0], b.val[0]);
    result.val[1] = vsubq_s32 (a.val[1], b.val[1]);
}

inline REAL NE10_HALF (REAL src)
{
    const int32x4_t CONST_HALF_NEON = { -1, -1, -1, -1 };
    src = vshlq_s32 (src, CONST_HALF_NEON);
    return src;
}
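
/*
 * vshlq_s32 with a negative shift count performs an arithmetic shift right,
 * so NE10_HALF divides each signed lane by two, rounding toward negative
 * infinity: e.g. 7 >> 1 == 3 and -7 >> 1 == -4.
 */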

// FFT Kernel
// F: Forward
// C: Complex
// U: Unscaled
template<int RADIX>
inline void NE10_FFT_FCU_NEON_S32 (CPLX [RADIX], const CPLX [RADIX]);

template<>
inline void NE10_FFT_FCU_NEON_S32<2> (CPLX scratch_out[2],
                                      const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_S32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}

template<>
inline void NE10_FFT_FCU_NEON_S32<3> (CPLX Fout[3],
                                      const CPLX Fin[3])
{
    CPLX scratch[4];

    Fout[0] = Fin[0];
    Fout[1] = Fin[1];
    Fout[2] = Fin[2];

    scratch[1] = Fout[1];
    scratch[2] = Fout[2];

    NE10_CPX_ADD_NEON_S32 (scratch[3], scratch[1], scratch[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[0], scratch[1], scratch[2]);

    Fout[1].val[0] = Fout[0].val[0] - NE10_HALF (scratch[3].val[0]);
    Fout[1].val[1] = Fout[0].val[1] - NE10_HALF (scratch[3].val[1]);

    scratch[0].val[0] = NE10_S_MUL_NEON_S32 (scratch[0].val[0], TW_3IN_S32);
    scratch[0].val[1] = NE10_S_MUL_NEON_S32 (scratch[0].val[1], TW_3IN_S32);

    Fout[0].val[0] += scratch[3].val[0];
    Fout[0].val[1] += scratch[3].val[1];

    Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1];
    Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0];

    Fout[1].val[0] -= scratch[0].val[1];
    Fout[1].val[1] += scratch[0].val[0];
}
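
/*
 * The radix-3 butterfly uses W3 = exp(-2*pi*j/3) = -1/2 - j*sqrt(3)/2: the
 * -1/2 real part is applied via NE10_HALF, while TW_3IN_S32 is the Q31
 * encoding of the sqrt(3)/2 term (with the sign convention fixed by the
 * surrounding add/sub sequence).
 */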

template<>
inline void NE10_FFT_FCU_NEON_S32<4> (CPLX scratch_out[4],
                                      const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_S32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_S32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_S32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}
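
/*
 * The radix-4 butterfly needs no multiplies: its only non-trivial twiddle is
 * -j, and multiplying (r + j*i) by -j gives (i - j*r), which is implemented
 * above by swapping the real/imaginary parts of scratch[3] with a sign flip.
 */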

template<>
inline void NE10_FFT_FCU_NEON_S32<5> (CPLX Fout[5],
                                      const CPLX Fin[5])
{
    CPLX scratch[13], scratch_in[5];

    scratch_in[0] = Fin[0];
    scratch_in[1] = Fin[1];
    scratch_in[2] = Fin[2];
    scratch_in[3] = Fin[3];
    scratch_in[4] = Fin[4];

    scratch[0] = scratch_in[0];
    scratch[1] = scratch_in[1];
    scratch[2] = scratch_in[2];
    scratch[3] = scratch_in[3];
    scratch[4] = scratch_in[4];

    NE10_CPX_ADD_NEON_S32 (scratch[ 7], scratch[1], scratch[4]);
    NE10_CPX_SUB_NEON_S32 (scratch[10], scratch[1], scratch[4]);
    NE10_CPX_ADD_NEON_S32 (scratch[ 8], scratch[2], scratch[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[ 9], scratch[2], scratch[3]);

    scratch_in[0].val[0] += scratch[7].val[0] + scratch[8].val[0];
    scratch_in[0].val[1] += scratch[7].val[1] + scratch[8].val[1];

    scratch[5].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5B_S32.r);
    scratch[5].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5B_S32.r);

    scratch[6].val[0] = NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5A_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5B_S32.i);
    scratch[6].val[1] = -NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5A_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5B_S32.i);

    NE10_CPX_SUB_NEON_S32 (scratch_in[1], scratch[5], scratch[6]);
    NE10_CPX_ADD_NEON_S32 (scratch_in[4], scratch[5], scratch[6]);

    scratch[11].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5A_S32.r);
    scratch[11].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5A_S32.r);

    scratch[12].val[0] = -NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5B_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5A_S32.i);
    scratch[12].val[1] = NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5B_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5A_S32.i);

    NE10_CPX_ADD_NEON_S32 (scratch_in[2], scratch[11], scratch[12]);
    NE10_CPX_SUB_NEON_S32 (scratch_in[3], scratch[11], scratch[12]);

    Fout[0] = scratch_in[0];
    Fout[1] = scratch_in[1];
    Fout[2] = scratch_in[2];
    Fout[3] = scratch_in[3];
    Fout[4] = scratch_in[4];
}
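
/*
 * TW_5A_S32 and TW_5B_S32 appear to hold the Q31 cos/sin pairs of the
 * radix-5 twiddles exp(-2*pi*j/5) and exp(-4*pi*j/5), so the five outputs
 * are assembled from two symmetric sum/difference pairs (elements 1/4 and
 * 2/3) plus the DC term, rather than a full set of complex multiplies.
 */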

// Following are butterfly functions.
template<ne10_int32_t RADIX, bool is_first_stage, bool is_inverse, bool is_scaled>
static __attribute__ ((noinline)) void ne10_radix_butterfly_int32_neon (
    CPLX *Fout,
    const CPLX *Fin,
    const ne10_fft_cpx_int32_t *twiddles,
    const ne10_int32_t fstride,
    const ne10_int32_t out_step,
    const ne10_int32_t nfft)
{
    PRINT_HIT;
    const ne10_int32_t in_step = nfft / RADIX;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            CPLX in[RADIX];
            CPLX out[RADIX];

            NE10_LOAD_BY_STEP<RADIX, CPLX> (in, Fin, in_step);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (in);
            }

            if (is_scaled)
            {
                NE10_FFT_SCALING<RADIX> () (in);
            }

            if (!is_first_stage)
            {
                NE10_LOAD_TW_AND_MUL<RADIX> (in, twiddles, out_step);
            }

            NE10_FFT_FCU_NEON_S32<RADIX> (out, in);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (out);
            }

            NE10_STORE_BY_STEP<RADIX, CPLX> (Fout, out, out_step);

            Fin++;

            if (!is_first_stage)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += RADIX;
            }
        }
        if (!is_first_stage)
        {
            twiddles -= out_step;
            Fout += (RADIX - 1) * out_step;
        }
    }
}
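
/*
 * The inverse transform is realized by conjugation: conjugate the inputs,
 * run the forward kernel, then conjugate the outputs, using the identity
 * IDFT(x) = conj(DFT(conj(x))) (up to the 1/N factor handled by the scaling
 * path). The first stage reads its RADIX inputs with stride
 * in_step = nfft / RADIX and writes them contiguously; later stages apply
 * twiddles and write with stride out_step.
 */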

template<bool is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_int32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_int32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[ stage_count << 1 ]; // radix of first stage
    nfft = fstride * radix;

    // swap so that the final stage writes to Fout
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage
    switch (radix)
    {
    case 2:
        ne10_radix_butterfly_int32_neon<2, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_butterfly_int32_neon<4, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_butterfly_int32_neon<3, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_butterfly_int32_neon<5, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (!stage_count) // finished
    {
        return;
    }

    mstride *= radix;

    // update radix
    if (radix % 2)
    {
        twiddles += radix;
    }
    radix = factors[ stage_count << 1 ];

    // other stages
    while (stage_count > 0)
    {
        // radix of this stage; must be one of {2, 3, 4, 5}
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_butterfly_int32_neon<2, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_butterfly_int32_neon<3, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_butterfly_int32_neon<4, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_butterfly_int32_neon<5, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        } // switch (radix)

        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[ stage_count << 1 ];
    } // while (stage_count)
}
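
/*
 * Layout of the factors buffer, as consumed above: factors[0] is the number
 * of stages, factors[1] is the fstride of the first stage, and
 * factors[2 * s] is the radix of stage s, read from s = stage_count down to
 * s = 1. As an illustrative example, nfft = 24 factored as 4 * 3 * 2 would
 * give stage_count = 3 with the radices consumed in the order 4, 3, 2, and
 * fstride = nfft / first_radix = 6.
 */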

template<bool is_inverse, bool is_scaled>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t)
{
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);

            if (is_scaled)
            {
                NE10_FFT_SCALING<4> () (scratch_in);
            }

            // Transpose
            {
                float32x4x2_t scratch0, scratch_in0;
                float32x4x2_t scratch1, scratch_in1;
                float32x4x2_t scratch2, scratch_in2;
                float32x4x2_t scratch3, scratch_in3;

                scratch_in0.val[0] = vreinterpretq_f32_s32 (scratch_in[0].val[0]);
                scratch_in1.val[0] = vreinterpretq_f32_s32 (scratch_in[1].val[0]);
                scratch_in2.val[0] = vreinterpretq_f32_s32 (scratch_in[2].val[0]);
                scratch_in3.val[0] = vreinterpretq_f32_s32 (scratch_in[3].val[0]);
                scratch_in0.val[1] = vreinterpretq_f32_s32 (scratch_in[0].val[1]);
                scratch_in1.val[1] = vreinterpretq_f32_s32 (scratch_in[1].val[1]);
                scratch_in2.val[1] = vreinterpretq_f32_s32 (scratch_in[2].val[1]);
                scratch_in3.val[1] = vreinterpretq_f32_s32 (scratch_in[3].val[1]);

                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);

                scratch_in[0].val[0] = vreinterpretq_s32_f32 (scratch0.val[0]);
                scratch_in[1].val[0] = vreinterpretq_s32_f32 (scratch1.val[0]);
                scratch_in[2].val[0] = vreinterpretq_s32_f32 (scratch2.val[0]);
                scratch_in[3].val[0] = vreinterpretq_s32_f32 (scratch3.val[0]);
                scratch_in[0].val[1] = vreinterpretq_s32_f32 (scratch0.val[1]);
                scratch_in[1].val[1] = vreinterpretq_s32_f32 (scratch1.val[1]);
                scratch_in[2].val[1] = vreinterpretq_s32_f32 (scratch2.val[1]);
                scratch_in[3].val[1] = vreinterpretq_s32_f32 (scratch3.val[1]);
            }

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_in);
            }

            // Not first stage
            {
                CPLX scratch_tw[3];

                scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
                scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
                scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);

                NE10_CPX_MUL_NEON_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
            }

            NE10_FFT_FCU_NEON_S32<4> (scratch_out, scratch_in);

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_out);
            }

            // Store.
            {
                ne10_fft_cpx_int32_t *Fout_cpx;
                Fout_cpx = (ne10_fft_cpx_int32_t *) Fout;

                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
            }

            Fin += 4;
            Fout += 1;
            twiddles += 4;
        }
    }

    ne10_int32_t left_over = out_step % 4;
    if (left_over == 0)
    {
        return;
    }

    // Left over.
    const ne10_fft_cpx_int32_t *Fin_s = (ne10_fft_cpx_int32_t *) Fin;
    ne10_fft_cpx_int32_t *Fout_s = (ne10_fft_cpx_int32_t *) Fout;
    for (m_count = left_over; m_count > 0; m_count--)
    {
        ne10_fft_cpx_int32_t scratch_in[4];
        ne10_fft_cpx_int32_t scratch_tw[4];

        scratch_in[0] = Fin_s[0];
        scratch_in[1] = Fin_s[1];
        scratch_in[2] = Fin_s[2];
        scratch_in[3] = Fin_s[3];

        if (is_scaled)
        {
            scratch_in[0].r = scratch_in[0].r >> 2;
            scratch_in[1].r = scratch_in[1].r >> 2;
            scratch_in[2].r = scratch_in[2].r >> 2;
            scratch_in[3].r = scratch_in[3].r >> 2;

            scratch_in[0].i = scratch_in[0].i >> 2;
            scratch_in[1].i = scratch_in[1].i >> 2;
            scratch_in[2].i = scratch_in[2].i >> 2;
            scratch_in[3].i = scratch_in[3].i >> 2;
        }

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        NE10_CPX_MUL_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
        NE10_CPX_MUL_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
        NE10_CPX_MUL_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);

        FFT_FCU<4> (scratch_in, scratch_in);

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        Fout_s[0 * out_step] = scratch_in[0];
        Fout_s[1 * out_step] = scratch_in[1];
        Fout_s[2 * out_step] = scratch_in[2];
        Fout_s[3 * out_step] = scratch_in[3];

        Fin_s += 4;
        Fout_s += 1;
        twiddles += 1;
    }
}

#endif