Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_generic_float32.neonintrinsic.cpp
1 /*
2  * Copyright 2014-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_fft_generic_float32.neonintrinsic.cpp
45  *
46  * This file must be compiled by a C++ toolchain because some functions are
47  * written as templates, which makes it easier for the compiler to
48  * eliminate branch jumps.
49  */
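/*
 * Note: the butterfly routines in this file take is_first_stage, is_inverse
 * and is_scaled as template parameters. Because these are compile-time
 * constants, branches such as "if (is_inverse == 1)" are resolved when a
 * template is instantiated, so each instantiation carries no runtime
 * branching for those options.
 */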
50 
51 #include "NE10_types.h"
52 #include "NE10_macros.h"
53 #include "NE10_fft.neonintrinsic.h"
55 
56 typedef float32x4x2_t CPLX;
57 typedef float32x4_t REAL;
58 #define NE10_REAL_DUP_NEON_F32 vdupq_n_f32
59 #define NE10_CPLX_LOAD(PTR) vld2q_f32 ((ne10_float32_t*) (PTR))
60 #define NE10_CPLX_STORE(PTR,OUT) \
61  do { \
62  vst2q_f32 ((ne10_float32_t*) (PTR), OUT); \
63  } while (0)
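/*
 * CPLX holds four complex samples in de-interleaved form: val[0] contains the
 * four real parts and val[1] the four imaginary parts. NE10_CPLX_LOAD
 * (vld2q_f32) and NE10_CPLX_STORE (vst2q_f32) convert between this layout and
 * the interleaved {r, i} pairs in memory, so each kernel below processes four
 * independent complex values per NEON operation.
 */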
64 
65 static inline void NE10_LOAD_TW_AND_MUL (CPLX &scratch_in,
66  const ne10_fft_cpx_float32_t *ptr_in)
67 {
68  CPLX scratch_tw;
69  float32x2_t d2_tmp = vld1_f32 ((ne10_float32_t *)ptr_in);
70  scratch_tw.val[0] = NE10_REAL_DUP_NEON_F32 (d2_tmp[0]);
71  scratch_tw.val[1] = NE10_REAL_DUP_NEON_F32 (d2_tmp[1]);
72  NE10_CPX_MUL_NEON_F32 (scratch_in, scratch_in, scratch_tw);
73 }
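/*
 * NE10_LOAD_TW_AND_MUL broadcasts a single twiddle factor w = wr + j*wi across
 * a vector and multiplies it into four complex samples at once; the complex
 * product follows the usual identity
 * (a + j*b) * (wr + j*wi) = (a*wr - b*wi) + j*(a*wi + b*wr).
 */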
74 
75 static inline REAL NE10_S_MUL_NEON_F32 (const REAL vec,
76  const ne10_float32_t scalar)
77 {
78  REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
79  REAL result = scalar_neon * vec;
80  return result;
81 }
82 
83 static inline REAL NE10_S_MLA_NEON_F32 (const REAL dst,
84  const REAL src,
85  const ne10_float32_t scalar)
86 {
87  REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
88  return vmlaq_f32 (dst, src, scalar_neon);
89 }
90 
91 static inline REAL NE10_S_MLS_NEON_F32 (const REAL dst,
92  const REAL src,
93  const ne10_float32_t scalar)
94 {
95  REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
96  return vmlsq_f32 (dst, src, scalar_neon);
97 }
98 
100 // Multiply inputs by twiddle factors.
102 static inline void NE10_FFT2_MUL_TW_NEON (CPLX scratch_out[2],
103  const CPLX scratch_in[2],
104  const CPLX scratch_tw[1])
105 {
106  scratch_out[0] = scratch_in[0];
107  NE10_CPX_MUL_NEON_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
108 }
109 
110 static inline void NE10_FFT3_MUL_TW_NEON (CPLX scratch_out[3],
111  const CPLX scratch_in[3],
112  const CPLX scratch_tw[2])
113 {
114  NE10_FFT2_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
115  NE10_CPX_MUL_NEON_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
116 }
117 
118 static inline void NE10_FFT4_MUL_TW_NEON (CPLX scratch_out[4],
119  const CPLX scratch_in[4],
120  const CPLX scratch_tw[3])
121 {
122  NE10_FFT3_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
123  NE10_CPX_MUL_NEON_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
124 }
125 
126 static inline void NE10_FFT5_MUL_TW_NEON (CPLX scratch_out[5],
127  const CPLX scratch_in[5],
128  const CPLX scratch_tw[4])
129 {
130  NE10_FFT4_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
131  NE10_CPX_MUL_NEON_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
132 }
133 
135 // Conjugate in place.
137 static inline void NE10_FFT2_CONJ (CPLX scratch_out[2])
138 {
139  scratch_out[0].val[1] = -scratch_out[0].val[1];
140  scratch_out[1].val[1] = -scratch_out[1].val[1];
141 }
142 
143 static inline void NE10_FFT3_CONJ (CPLX scratch_out[3])
144 {
145  NE10_FFT2_CONJ (scratch_out);
146  scratch_out[2].val[1] = -scratch_out[2].val[1];
147 }
148 
149 static inline void NE10_FFT4_CONJ (CPLX scratch_out[4])
150 {
151  NE10_FFT3_CONJ (scratch_out);
152  scratch_out[3].val[1] = -scratch_out[3].val[1];
153 }
154 
155 static inline void NE10_FFT5_CONJ (CPLX scratch_out[5])
156 {
157  NE10_FFT4_CONJ (scratch_out);
158  scratch_out[4].val[1] = -scratch_out[4].val[1];
159 }
160 
161 static inline void NE10_FFT8_CONJ (CPLX scratch_out[8])
162 {
163  NE10_FFT5_CONJ (scratch_out);
164  scratch_out[5].val[1] = -scratch_out[5].val[1];
165  scratch_out[6].val[1] = -scratch_out[6].val[1];
166  scratch_out[7].val[1] = -scratch_out[7].val[1];
167 }
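// The conjugation helpers above support computing the inverse transform via
// the identity IFFT(x) = conj(FFT(conj(x))) / N: when is_inverse == 1, the
// butterfly templates conjugate the inputs, run the forward kernel, then
// conjugate (and optionally scale) the outputs.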
168 
170 // Scaling
171 // If the macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing.
173 static inline void NE10_FFT2_SCALING (CPLX scratch_out[2],
174  const REAL one_by_fft_neon)
175 {
176 #ifdef NE10_DSP_CFFT_SCALING
177  scratch_out[0].val[0] *= one_by_fft_neon;
178  scratch_out[0].val[1] *= one_by_fft_neon;
179  scratch_out[1].val[0] *= one_by_fft_neon;
180  scratch_out[1].val[1] *= one_by_fft_neon;
181 #endif
182 }
183 
184 static inline void NE10_FFT3_SCALING (CPLX scratch_out[3],
185  const REAL one_by_fft_neon)
186 {
187 #ifdef NE10_DSP_CFFT_SCALING
188  NE10_FFT2_SCALING (scratch_out, one_by_fft_neon);
189  scratch_out[2].val[0] *= one_by_fft_neon;
190  scratch_out[2].val[1] *= one_by_fft_neon;
191 #endif
192 }
193 
194 static inline void NE10_FFT4_SCALING (CPLX scratch_out[4],
195  const REAL one_by_fft_neon)
196 {
197 #ifdef NE10_DSP_CFFT_SCALING
198  NE10_FFT3_SCALING (scratch_out, one_by_fft_neon);
199  scratch_out[3].val[0] *= one_by_fft_neon;
200  scratch_out[3].val[1] *= one_by_fft_neon;
201 #endif
202 }
203 
204 static inline void NE10_FFT5_SCALING (CPLX scratch_out[5],
205  const REAL one_by_fft_neon)
206 {
207 #ifdef NE10_DSP_CFFT_SCALING
208  NE10_FFT4_SCALING (scratch_out, one_by_fft_neon);
209  scratch_out[4].val[0] *= one_by_fft_neon;
210  scratch_out[4].val[1] *= one_by_fft_neon;
211 #endif
212 }
213 
214 static inline void NE10_FFT8_SCALING (CPLX scratch_out[8],
215  const REAL one_by_fft_neon)
216 {
217 #ifdef NE10_DSP_CFFT_SCALING
218  NE10_FFT5_SCALING (scratch_out, one_by_fft_neon);
219  scratch_out[5].val[0] *= one_by_fft_neon;
220  scratch_out[5].val[1] *= one_by_fft_neon;
221  scratch_out[6].val[0] *= one_by_fft_neon;
222  scratch_out[6].val[1] *= one_by_fft_neon;
223  scratch_out[7].val[0] *= one_by_fft_neon;
224  scratch_out[7].val[1] *= one_by_fft_neon;
225 #endif
226 }
227 
229 // FFT Kernel
230 // F: Forward
231 // C: Complex
232 // U: Unscaled
234 static inline void NE10_FFT2_FUC_NEON_F32 (CPLX scratch_out[2],
235  const CPLX scratch_in[2])
236 {
237  NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch_in[0], scratch_in[1]);
238  NE10_CPX_SUB_NEON_F32 (scratch_out[1], scratch_in[0], scratch_in[1]);
239 }
240 
241 static inline void NE10_FFT3_FUC_NEON_F32 (CPLX Fout[3],
242  const CPLX Fin[3])
243 {
244  const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
245  const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);
246 
247  NE10_CPX_ADD_NEON_F32 (Fout[2], Fin[1], Fin[2]);
248  NE10_CPX_SUB_NEON_F32 (Fout[0], Fin[1], Fin[2]);
249 
250  Fout[1].val[0] = Fin[0].val[0] - Fout[2].val[0] * HALF_NEON_F32;
251  Fout[1].val[1] = Fin[0].val[1] - Fout[2].val[1] * HALF_NEON_F32;
252 
253  Fout[0].val[0] = Fout[0].val[0] * TW_3IN_NEON_F32;
254  Fout[0].val[1] = Fout[0].val[1] * TW_3IN_NEON_F32;
255 }
256 
257 static inline void NE10_FFT4_FUC_NEON_F32 (CPLX scratch_out[4],
258  const CPLX scratch_in[4])
259 {
260  CPLX scratch[4];
261 
262  NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
263  NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
264  NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
265  NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);
266 
267  NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
268  NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);
269 
270  scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
271  scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
272  scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
273  scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
274 }
275 
276 static inline void NE10_FFT4_FUC_INPLACE_NEON_F32 (CPLX scratch_out[4])
277 {
278  CPLX scratch[4];
279 
280  NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_out[0], scratch_out[2]);
281  NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_out[0], scratch_out[2]);
282  NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_out[1], scratch_out[3]);
283  NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_out[1], scratch_out[3]);
284 
285  NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
286  NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);
287 
288  scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
289  scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
290  scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
291  scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
292 }
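/*
 * Both radix-4 kernels above implement the length-4 DFT
 *   X0 = (x0 + x2) + (x1 + x3)
 *   X1 = (x0 - x2) - j*(x1 - x3)
 *   X2 = (x0 + x2) - (x1 + x3)
 *   X3 = (x0 - x2) + j*(x1 - x3)
 * Multiplying by -j swaps the real and imaginary parts and negates the new
 * imaginary part, which is what the last four scalar lines compute.
 */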
293 
294 static inline void NE10_FFT5_FUC_INPLACE_NEON_F32 (CPLX Fout[5])
295 {
296  CPLX s[6];
297 
298  NE10_CPX_ADD_NEON_F32 (s[1], Fout[1], Fout[4]);
299  NE10_CPX_ADD_NEON_F32 (s[2], Fout[2], Fout[3]);
300 
301  s[0] = Fout[0];
302  s[5] = Fout[0];
303 
304  Fout[0].val[0] = Fout[0].val[0] + s[1].val[0] + s[2].val[0];
305  Fout[0].val[1] = Fout[0].val[1] + s[1].val[1] + s[2].val[1];
306 
307  s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
308  s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
309  s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
310  s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);
311 
312  s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
313  s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
314  s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
315  s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);
316 
317  NE10_CPX_SUB_NEON_F32 (s[4], Fout[1], Fout[4]);
318  NE10_CPX_SUB_NEON_F32 (s[3], Fout[2], Fout[3]);
319 
320  s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
321  s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
322  s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
323  s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);
324 
325  s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
326  s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
327  s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
328  s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);
329 
330  NE10_CPX_SUB_NEON_F32 (Fout[1], s[0], s[1]);
331  NE10_CPX_ADD_NEON_F32 (Fout[4], s[0], s[1]);
332  NE10_CPX_ADD_NEON_F32 (Fout[2], s[5], s[2]);
333  NE10_CPX_SUB_NEON_F32 (Fout[3], s[5], s[2]);
334 }
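/*
 * The radix-5 kernel above mirrors the structure of Kiss FFT's kf_bfly5
 * (see the license reproduced at the top of this file). TW_5A_F32 and
 * TW_5B_F32, defined in the NE10 FFT headers, are expected to hold the
 * primitive 5th roots of unity exp(-2*pi*j/5) and exp(-4*pi*j/5), whose real
 * and imaginary parts appear as the .r/.i scalars in the multiply-accumulate
 * steps.
 */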
335 
336 #define NE10_BUTTERFLY_INDEX_NEON_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
337  do { \
338  NE10_CPX_ADD_NEON_F32 (OUT[OUT_I],IN[IN_I],IN[IN_J]); \
339  NE10_CPX_SUB_NEON_F32 (OUT[OUT_J],IN[IN_I],IN[IN_J]); \
340  } while (0)
341 
342 static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8],
343  const CPLX in[8])
344 {
345  CPLX s[8];
346  const static ne10_fft_cpx_float32_t TW_8[4] =
347  {
348  { 1.00000, 0.00000 },
349  { 0.70711, -0.70711 },
350  { 0.00000, -1.00000 },
351  { -0.70711, -0.70711 },
352  };
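  // The table above holds e^(-j*pi*k/4) for k = 0..3 (0.70711 approximates
  // sqrt(2)/2), so the radix-8 kernel needs no runtime twiddle computation.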
353 
354  // STAGE - 1
355  // in -> s
356  {
357  NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 0, 4, 0, 4);
358  NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 1, 5, 1, 5);
359  NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 2, 6, 2, 6);
360  NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 3, 7, 3, 7);
361  }
362 
363  // STAGE - 2
364  // s -> out
365  {
366  // TW
367 #define NE10_CPX_MUL_TW8_NEON_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
368  do { \
369  ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
370  CPLX TW_TMP_NEON; \
371  TW_TMP_NEON.val[0] = NE10_REAL_DUP_NEON_F32 (TW_TMP.r); \
372  TW_TMP_NEON.val[1] = NE10_REAL_DUP_NEON_F32 (TW_TMP.i); \
373  NE10_CPX_MUL_NEON_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP_NEON); \
374  } while (0)
375 
376  NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 4, 0);
377  NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 5, 1);
378  NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 6, 2);
379  NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 7, 3);
380 
381  NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 0, 2, 0, 2);
382  NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 1, 3, 1, 3);
383  NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 4, 6, 4, 6);
384  NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 5, 7, 5, 7);
385  }
386  // STAGE - 3
387  // out -> s
388  {
389  // TW
390  NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 2, 0);
391  NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 3, 2);
392  NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 6, 0);
393  NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 7, 2);
394 #undef NE10_CPX_MUL_TW8_NEON_F32
395 
396  NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 0, 4, 0, 1);
397  NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 2, 6, 2, 3);
398  NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 1, 5, 4, 5);
399  NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 3, 7, 6, 7);
400  }
401 
402  out[0] = s[0];
403  out[1] = s[1];
404  out[2] = s[2];
405  out[3] = s[3];
406  out[4] = s[4];
407  out[5] = s[5];
408  out[6] = s[6];
409  out[7] = s[7];
410 }
411 
413 // The following are the butterfly functions.
415 template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
416 static void ne10_radix_2_butterfly_float32_neon (CPLX *Fout,
417  const CPLX *Fin,
418  const ne10_fft_cpx_float32_t *twiddles,
419  const ne10_int32_t fstride,
420  const ne10_int32_t out_step,
421  const ne10_int32_t nfft)
422 {
423  CPLX in[2];
424  CPLX out[2];
425 
426  const ne10_int32_t in_step = nfft / 2;
427  ne10_int32_t f_count;
428  ne10_int32_t m_count;
429 
430  const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
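  // The factor is 0.25 / nfft rather than 1 / nfft because the nfft passed
  // into these kernels is a quarter of the full transform length: the data
  // are processed as four parallel sub-transforms (see the "nfft * 4" notes
  // in the top-level wrappers at the end of this file).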
431 
432  for (f_count = fstride; f_count > 0; f_count--)
433  {
434  for (m_count = out_step; m_count > 0; m_count--)
435  {
436 #ifndef NE10_INLINE_ASM_OPT
437  in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
438  in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
439 
440  if (is_inverse == 1)
441  {
442  NE10_FFT2_CONJ (in);
443  }
444 
445  if (is_first_stage == 0)
446  {
447  NE10_LOAD_TW_AND_MUL (in[1], twiddles);
448  }
449 
450  NE10_FFT2_FUC_NEON_F32 (out, in);
451 
452  if (is_inverse == 1)
453  {
454  NE10_FFT2_CONJ (out);
455 
456  if (is_scaled)
457  {
458  NE10_FFT2_SCALING (out, one_by_fft_neon);
459  }
460  }
461 
462  NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
463  NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
464 #else // NE10_INLINE_ASM_OPT
465 #ifndef __aarch64__
466 #error Currently, inline assembly optimizations are only available on AArch64.
467 #else // __aarch64__
468  asm volatile (
469  "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
470  "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
471  :
472  : [pin0]"r"(Fin),
473  [pin1]"r"(Fin + in_step)
474  : "memory", "v0", "v1", "v2", "v3");
475 
476  if (is_inverse == 1)
477  {
478  asm volatile (
479  "fneg v1.4s, v1.4s \n\t"
480  "fneg v3.4s, v3.4s \n\t"
481  :
482  :
483  : "v0", "v1", "v2", "v3");
484  }
485 
486  if (is_first_stage == 0)
487  {
488  asm volatile (
489  "ld1 {v4.1d}, [%[ptw]] \n\t" // tw0
490 
491  "fmul v14.4s, v2.4s, v4.s[1] \n\t" // RI
492  "fmul v2.4s, v2.4s, v4.s[0] \n\t" // RR
493  "fmls v2.4s, v3.4s, v4.s[1] \n\t" // RR - II
494  "fmul v15.4s, v3.4s, v4.s[0] \n\t" // IR
495  "fadd v3.4s, v14.4s, v15.4s \n\t" // RI + IR
496  :
497  : [ptw]"r"(twiddles)
498  : "memory", "v0", "v1", "v2", "v3", "v4", "v14", "v15");
499  }
500 
501  asm volatile (
502  "fadd v4.4s, v0.4s, v2.4s \n\t"
503  "fadd v5.4s, v1.4s, v3.4s \n\t"
504  "fsub v6.4s, v0.4s, v2.4s \n\t"
505  "fsub v7.4s, v1.4s, v3.4s \n\t"
506  :
507  :
508  : "memory",
509  "v0", "v1", "v2", "v3", // in
510  "v4", "v5", "v6", "v7"); // out
511 
512  if (is_inverse == 1)
513  {
514  asm volatile (
515  "fneg v5.4s, v5.4s \n\t"
516  "fneg v7.4s, v7.4s \n\t"
517  :
518  :
519  : "v4", "v5", "v6", "v7");
520  }
521 
522  if (is_scaled == 1)
523  {
524  asm volatile (
525  "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
526  "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
527  "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
528  "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
529  :
530  : [one_by_nfft]"w"(one_by_fft_neon)
531  : "v4", "v5", "v6", "v7");
532  }
533 
534  asm volatile (
535  "st2 {v4.4s, v5.4s}, [%[pout0]] \n\t"
536  "st2 {v6.4s, v7.4s}, [%[pout1]] \n\t"
537  :
538  : [pout0]"r"(Fout),
539  [pout1]"r"(Fout + out_step)
540  : "memory", "v4", "v5", "v6", "v7");
541 #endif // __aarch64__
542 #endif // NE10_INLINE_ASM_OPT
543 
544  Fin++;
545 
546  if (is_first_stage == 0)
547  {
548  Fout++;
549  twiddles++;
550  }
551  else
552  {
553  Fout += 2;
554  }
555  }
556  if (is_first_stage == 0)
557  {
558  twiddles -= out_step;
559  Fout += (2 - 1) * out_step;
560  }
561  }
562 }
563 template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
564 static void ne10_radix_4_butterfly_float32_neon (CPLX *Fout,
565  const CPLX *Fin,
566  const ne10_fft_cpx_float32_t *twiddles,
567  const ne10_int32_t fstride,
568  const ne10_int32_t out_step,
569  const ne10_int32_t nfft)
570 {
571  CPLX in[4];
572  #ifdef NE10_INLINE_ASM_OPT
573  CPLX s[4];
574  #endif
575 
576  const ne10_int32_t in_step = nfft / 4;
577  ne10_int32_t f_count;
578  ne10_int32_t m_count;
579 
580  const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
581 
582  for (f_count = fstride; f_count > 0; f_count--)
583  {
584  for (m_count = out_step; m_count > 0; m_count--)
585  {
586 #ifndef NE10_INLINE_ASM_OPT
587  in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
588  in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
589  in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
590  in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
591 
592  if (is_inverse == 1)
593  {
594  NE10_FFT4_CONJ (in);
595  }
596 
597  if (is_first_stage == 0)
598  {
599  NE10_LOAD_TW_AND_MUL (in[1], twiddles);
600  NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
601  NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
602  }
603 
604  NE10_FFT4_FUC_INPLACE_NEON_F32 (in);
605 
606  if (is_inverse == 1)
607  {
608  NE10_FFT4_CONJ (in);
609  }
610  if (is_scaled)
611  {
612  NE10_FFT4_SCALING (in, one_by_fft_neon);
613  }
614 
615  NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
616  NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
617  NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
618  NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
619 #else // NE10_INLINE_ASM_OPT
620 #ifndef __aarch64__
621 #error Currently, inline assembly optimizations are only available on AArch64.
622 #else // __aarch64__
623 #define NEON_REGISTERS_C2C_R4 \
624  "v0", "v1", \
625  "v2", "v3", \
626  "v4", "v5", \
627  "v6", "v7"
628 #define NEON_REGISTERS_C2C_TW_R4 \
629  "v8", "v9", \
630  "v10", "v11", \
631  "v12", "v13"
632 
633  asm volatile (
634  "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t" // in[0]
635  "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t" // in[1]
636  "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t" // in[2]
637  "ld2 {v6.4s, v7.4s}, [%[pin3]] \n\t" // in[3]
638  :
639  : [pin0]"r"(Fin),
640  [pin1]"r"(Fin + in_step),
641  [pin2]"r"(Fin + in_step * 2),
642  [pin3]"r"(Fin + in_step * 3)
643  : "memory", NEON_REGISTERS_C2C_R4);
644 
645  if (is_inverse == 1)
646  {
647  asm volatile (
648  "fneg v1.4s, v1.4s \n\t"
649  "fneg v3.4s, v3.4s \n\t"
650  "fneg v5.4s, v5.4s \n\t"
651  "fneg v7.4s, v7.4s \n\t"
652  :
653  :
654  : NEON_REGISTERS_C2C_R4);
655  }
656 
657  if (is_first_stage == 0)
658  {
659  asm volatile (
660  "ld1 { v8.1d}, [%[ptw0]] \n\t" // tw0
661  "ld1 { v9.1d}, [%[ptw1]] \n\t" // tw1
662  "ld1 {v10.1d}, [%[ptw2]] \n\t" // tw2
663 
664  "fmul v14.4s, v2.4s, v8.s[1] \n\t" // RI
665  "fmul v2.4s, v2.4s, v8.s[0] \n\t" // RR
666  "fmls v2.4s, v3.4s, v8.s[1] \n\t" // RR - II
667  "fmul v15.4s, v3.4s, v8.s[0] \n\t" // IR
668  "fadd v3.4s, v14.4s, v15.4s \n\t" // RI + IR
669 
670  "fmul v14.4s, v4.4s, v9.s[1] \n\t" // RI
671  "fmul v4.4s, v4.4s, v9.s[0] \n\t" // RR
672  "fmls v4.4s, v5.4s, v9.s[1] \n\t" // RR - II
673  "fmul v15.4s, v5.4s, v9.s[0] \n\t" // IR
674  "fadd v5.4s, v14.4s, v15.4s \n\t" // RI + IR
675 
676  "fmul v14.4s, v6.4s, v10.s[1] \n\t" // RI
677  "fmul v6.4s, v6.4s, v10.s[0] \n\t" // RR
678  "fmls v6.4s, v7.4s, v10.s[1] \n\t" // RR - II
679  "fmul v15.4s, v7.4s, v10.s[0] \n\t" // IR
680  "fadd v7.4s, v14.4s, v15.4s \n\t" // RI + IR
681  :
682  : [ptw0]"r"(twiddles),
683  [ptw1]"r"(twiddles + out_step),
684  [ptw2]"r"(twiddles + out_step * 2)
685  : "memory", NEON_REGISTERS_C2C_R4,
686  NEON_REGISTERS_C2C_TW_R4, "v14", "v15");
687  }
688 
689  asm volatile (
690  "fadd %[s0r].4s, v0.4s, v4.4s \n\t"
691  "fadd %[s0i].4s, v1.4s, v5.4s \n\t"
692 
693  "fsub %[s1r].4s, v0.4s, v4.4s \n\t"
694  "fsub %[s1i].4s, v1.4s, v5.4s \n\t"
695 
696  "fadd %[s2r].4s, v2.4s, v6.4s \n\t"
697  "fadd %[s2i].4s, v3.4s, v7.4s \n\t"
698 
699  "fsub %[s3r].4s, v2.4s, v6.4s \n\t"
700  "fsub %[s3i].4s, v3.4s, v7.4s \n\t"
701  : [s0r]"+w"(s[0].val[0]),
702  [s0i]"+w"(s[0].val[1]),
703  [s1r]"+w"(s[1].val[0]),
704  [s1i]"+w"(s[1].val[1]),
705  [s2r]"+w"(s[2].val[0]),
706  [s2i]"+w"(s[2].val[1]),
707  [s3r]"+w"(s[3].val[0]),
708  [s3i]"+w"(s[3].val[1])
709  :
710  : NEON_REGISTERS_C2C_R4);
711 
712  asm volatile (
713  "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
714  "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
715  "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
716  "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"
717 
718  "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
719  "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
720  "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
721  "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
722  :
723  : [s0r]"w"(s[0].val[0]),
724  [s0i]"w"(s[0].val[1]),
725  [s1r]"w"(s[1].val[0]),
726  [s1i]"w"(s[1].val[1]),
727  [s2r]"w"(s[2].val[0]),
728  [s2i]"w"(s[2].val[1]),
729  [s3r]"w"(s[3].val[0]),
730  [s3i]"w"(s[3].val[1])
731  : NEON_REGISTERS_C2C_R4);
732 
733  if (is_inverse == 1)
734  {
735  asm volatile (
736  "fneg v1.4s, v1.4s \n\t"
737  "fneg v3.4s, v3.4s \n\t"
738  "fneg v5.4s, v5.4s \n\t"
739  "fneg v7.4s, v7.4s \n\t"
740  :
741  :
742  : NEON_REGISTERS_C2C_R4);
743  }
744 
745  if (is_scaled)
746  {
747  asm volatile (
748  "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
749  "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
750  "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
751  "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
752  "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
753  "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
754  "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
755  "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
756  :
757  : [one_by_nfft]"w"(one_by_fft_neon)
758  : NEON_REGISTERS_C2C_R4);
759  }
760 
761  asm volatile (
762  "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
763  "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
764  "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
765  "st2 {v6.4s, v7.4s}, [%[pout3]] \n\t"
766  :
767  : [pout0]"r"(Fout),
768  [pout1]"r"(Fout + out_step),
769  [pout2]"r"(Fout + out_step * 2),
770  [pout3]"r"(Fout + out_step * 3)
771  : NEON_REGISTERS_C2C_R4);
772 #endif // __aarch64__
773 #endif // NE10_INLINE_ASM_OPT
774  Fin++;
775 
776  if (is_first_stage == 0)
777  {
778  Fout++;
779  twiddles++;
780  }
781  else
782  {
783  Fout += 4;
784  }
785  }
786  if (is_first_stage == 0)
787  {
788  twiddles -= out_step;
789  Fout += (4 - 1) * out_step;
790  }
791  }
792 }
793 
794 template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
795 static void ne10_radix_3_butterfly_float32_neon (CPLX *Fout,
796  const CPLX *Fin,
797  const ne10_fft_cpx_float32_t *twiddles,
798  const ne10_int32_t fstride,
799  const ne10_int32_t out_step,
800  const ne10_int32_t nfft)
801 {
802  CPLX in[3];
803  CPLX out[3];
804  CPLX s[4];
805 
806  const ne10_int32_t in_step = nfft / 3;
807  ne10_int32_t f_count;
808  ne10_int32_t m_count;
809 
810  const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
811  const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
812  const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);
813 
814  for (f_count = fstride; f_count > 0; f_count--)
815  {
816  for (m_count = out_step; m_count > 0; m_count--)
817  {
818 #ifndef NE10_INLINE_ASM_OPT
819  in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
820  in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
821  in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
822 
823  if (is_inverse == 1)
824  {
825  NE10_FFT3_CONJ (in);
826  }
827 
828  if (is_first_stage == 0)
829  {
830  NE10_LOAD_TW_AND_MUL (in[1], twiddles);
831  NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
832  }
833 
834  NE10_CPX_ADD_NEON_F32 (s[2], in[1], in[2]);
835  NE10_CPX_SUB_NEON_F32 (s[0], in[1], in[2]);
836  s[3] = in[0];
837 
838  s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
839  s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;
840 
841  s[1].val[0] += s[3].val[0];
842  s[1].val[1] += s[3].val[1];
843  s[0].val[0] *= TW_3IN_NEON_F32;
844  s[0].val[1] *= TW_3IN_NEON_F32;
845 
846  out[0].val[0] = s[3].val[0] + s[2].val[0];
847  out[0].val[1] = s[3].val[1] + s[2].val[1];
848  out[1].val[0] = s[1].val[0] - s[0].val[1];
849  out[1].val[1] = s[1].val[1] + s[0].val[0];
850  out[2].val[0] = s[1].val[0] + s[0].val[1];
851  out[2].val[1] = s[1].val[1] - s[0].val[0];
852 
853  if (is_inverse == 1)
854  {
855  NE10_FFT3_CONJ (out);
856  }
857  if (is_scaled)
858  {
859  NE10_FFT3_SCALING (out, one_by_fft_neon);
860  }
861 
862  NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
863  NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
864  NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
865 #else // NE10_INLINE_ASM_OPT
866 #ifndef __aarch64__
867 #error Currently, inline assembly optimizations are only available on AArch64.
868 #else // __aarch64__
869  asm volatile (
870  "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t" // in[0]
871  "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t" // in[1]
872  "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t" // in[2]
873  :
874  : [pin0]"r"(Fin),
875  [pin1]"r"(Fin + in_step),
876  [pin2]"r"(Fin + in_step * 2)
877  : "memory", "v0", "v1", "v2", "v3", "v4", "v5");
878 
879  if (is_inverse == 1)
880  {
881  asm volatile (
882  "fneg v1.4s, v1.4s \n\t"
883  "fneg v3.4s, v3.4s \n\t"
884  "fneg v5.4s, v5.4s \n\t"
885  :
886  :
887  : "v1", "v3", "v5");
888  }
889 
890  if (is_first_stage == 0)
891  {
892  asm volatile (
893  // Load twiddles.
894  "ld1 {v6.1d}, [%[ptw0]] \n\t" // tw0
895  "ld1 {v7.1d}, [%[ptw1]] \n\t" // tw1
896  // in[1] = in[1] * tw[0]
897  "fmul v10.4s, v2.4s, v6.s[1] \n\t" // RI
898  "fmul v2.4s, v2.4s, v6.s[0] \n\t" // RR
899  "fmls v2.4s, v3.4s, v6.s[1] \n\t" // RR - II
900  "fmul v11.4s, v3.4s, v6.s[0] \n\t" // IR
901  "fadd v3.4s, v10.4s, v11.4s \n\t" // RI + IR
902  // in[2] = in[2] * tw[1]
903  "fmul v10.4s, v4.4s, v7.s[1] \n\t" // RI
904  "fmul v4.4s, v4.4s, v7.s[0] \n\t" // RR
905  "fmls v4.4s, v5.4s, v7.s[1] \n\t" // RR - II
906  "fmul v11.4s, v5.4s, v7.s[0] \n\t" // IR
907  "fadd v5.4s, v10.4s, v11.4s \n\t" // RI + IR
908  :
909  : [ptw0]"r"(twiddles),
910  [ptw1]"r"(twiddles + out_step)
911  : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
912  "v6", "v7", "v8", "v9",
913  "v10", "v11");
914  }
915 
916  asm volatile (
917  "fadd %[s2r].4s, v2.4s, v4.4s \n\t"
918  "fadd %[s2i].4s, v3.4s, v5.4s \n\t"
919 
920  "fsub %[s0r].4s, v2.4s, v4.4s \n\t"
921  "fsub %[s0i].4s, v3.4s, v5.4s \n\t"
922 
923  "mov %[s3r].16b, v0.16b \n\t"
924  "mov %[s3i].16b, v1.16b \n\t"
925  : [s0r]"+w"(s[0].val[0]),
926  [s0i]"+w"(s[0].val[1]),
927  [s2r]"+w"(s[2].val[0]),
928  [s2i]"+w"(s[2].val[1]),
929  [s3r]"+w"(s[3].val[0]),
930  [s3i]"+w"(s[3].val[1])
931  :
932  : "v0", "v1", "v2", "v3", "v4", "v5");
933 
934  s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
935  s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;
936 
937  s[1].val[0] += s[3].val[0];
938  s[1].val[1] += s[3].val[1];
939  s[0].val[0] *= TW_3IN_NEON_F32;
940  s[0].val[1] *= TW_3IN_NEON_F32;
941 
942  // out[0] - {v0.4s, v1.4s}
943  // out[1] - {v2.4s, v3.4s}
944  // out[2] - {v4.4s, v5.4s}
945  asm volatile (
946  "fadd v0.4s, %[s3r].4s, %[s2r].4s \n\t"
947  "fadd v1.4s, %[s3i].4s, %[s2i].4s \n\t"
948  "fsub v2.4s, %[s1r].4s, %[s0i].4s \n\t"
949  "fadd v3.4s, %[s1i].4s, %[s0r].4s \n\t"
950  "fadd v4.4s, %[s1r].4s, %[s0i].4s \n\t"
951  "fsub v5.4s, %[s1i].4s, %[s0r].4s \n\t"
952  :
953  : [s0r]"w"(s[0].val[0]),
954  [s0i]"w"(s[0].val[1]),
955  [s1r]"w"(s[1].val[0]),
956  [s1i]"w"(s[1].val[1]),
957  [s2r]"w"(s[2].val[0]),
958  [s2i]"w"(s[2].val[1]),
959  [s3r]"w"(s[3].val[0]),
960  [s3i]"w"(s[3].val[1])
961  : "v0", "v1", "v2", "v3", "v4", "v5");
962 
963  if (is_inverse == 1)
964  {
965  asm volatile (
966  "fneg v1.4s, v1.4s \n\t"
967  "fneg v3.4s, v3.4s \n\t"
968  "fneg v5.4s, v5.4s \n\t"
969  :
970  :
971  : "v1", "v3", "v5");
972  }
973 
974  if (is_scaled)
975  {
976  asm volatile (
977  "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
978  "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
979  "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
980  "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
981  "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
982  "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
983  :
984  : [one_by_nfft]"w"(one_by_fft_neon)
985  : "v0", "v1", "v2", "v3", "v4", "v5");
986  }
987 
988  asm volatile (
989  "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
990  "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
991  "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
992  :
993  : [pout0]"r"(Fout),
994  [pout1]"r"(Fout + out_step),
995  [pout2]"r"(Fout + 2 * out_step)
996  : "memory", "v0", "v1", "v2", "v3", "v4", "v5");
997 #endif // __aarch64__
998 #endif // NE10_INLINE_ASM_OPT
999 
1000  Fin++;
1001 
1002  if (is_first_stage == 0)
1003  {
1004  Fout++;
1005  twiddles++;
1006  }
1007  else
1008  {
1009  Fout += 3;
1010  }
1011  }
1012  if (is_first_stage == 0)
1013  {
1014  twiddles -= out_step;
1015  Fout += (3 - 1) * out_step;
1016  }
1017  }
1018 }
1019 
1020 template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
1021 static void ne10_radix_5_butterfly_float32_neon (CPLX *Fout,
1022  const CPLX *Fin,
1023  const ne10_fft_cpx_float32_t *twiddles,
1024  const ne10_int32_t fstride,
1025  const ne10_int32_t out_step,
1026  const ne10_int32_t nfft)
1027 {
1028  CPLX in[5];
1029  CPLX s[6];
1030 
1031  const ne10_int32_t in_step = nfft / 5;
1032  ne10_int32_t f_count;
1033  ne10_int32_t m_count;
1034 
1035  const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
1036 
1037  for (f_count = fstride; f_count > 0; f_count--)
1038  {
1039  for (m_count = out_step; m_count > 0; m_count--)
1040  {
1041  in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
1042  in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
1043  in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
1044  in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
1045  in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);
1046 
1047  if (is_inverse == 1)
1048  {
1049  NE10_FFT5_CONJ (in);
1050  }
1051 
1052  if (is_first_stage == 0)
1053  {
1054  NE10_LOAD_TW_AND_MUL (in[1], twiddles);
1055  NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
1056  NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
1057  NE10_LOAD_TW_AND_MUL (in[4], twiddles + out_step * 3);
1058  }
1059 
1060  NE10_CPX_ADD_NEON_F32 (s[1], in[1], in[4]);
1061  NE10_CPX_ADD_NEON_F32 (s[2], in[2], in[3]);
1062 
1063  s[0] = in[0];
1064  s[5] = in[0];
1065 
1066  in[0].val[0] = in[0].val[0] + s[1].val[0] + s[2].val[0];
1067  in[0].val[1] = in[0].val[1] + s[1].val[1] + s[2].val[1];
1068 
1069  s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
1070  s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
1071  s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
1072  s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);
1073 
1074  s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
1075  s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
1076  s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
1077  s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);
1078 
1079  NE10_CPX_SUB_NEON_F32 (s[4], in[1], in[4]);
1080  NE10_CPX_SUB_NEON_F32 (s[3], in[2], in[3]);
1081 
1082  s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
1083  s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
1084  s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
1085  s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);
1086 
1087  s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
1088  s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
1089  s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
1090  s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);
1091 
1092  NE10_CPX_SUB_NEON_F32 (in[1], s[0], s[1]);
1093  NE10_CPX_ADD_NEON_F32 (in[4], s[0], s[1]);
1094  NE10_CPX_ADD_NEON_F32 (in[2], s[5], s[2]);
1095  NE10_CPX_SUB_NEON_F32 (in[3], s[5], s[2]);
1096 
1097  if (is_inverse == 1)
1098  {
1099  NE10_FFT5_CONJ (in);
1100  }
1101  if (is_scaled)
1102  {
1103  NE10_FFT5_SCALING (in, one_by_fft_neon);
1104  }
1105 
1106  NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
1107  NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
1108  NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
1109  NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
1110  NE10_CPLX_STORE (Fout + 4 * out_step, in[4]);
1111 
1112  Fin++;
1113 
1114  if (is_first_stage == 0)
1115  {
1116  Fout++;
1117  twiddles++;
1118  }
1119  else
1120  {
1121  Fout += 5;
1122  }
1123  }
1124  if (is_first_stage == 0)
1125  {
1126  twiddles -= out_step;
1127  Fout += (5 - 1) * out_step;
1128  }
1129  }
1130 }
1131 
1132 template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
1133 static void ne10_radix_8_butterfly_float32_neon (CPLX *Fout,
1134  const CPLX *Fin,
1135  const ne10_fft_cpx_float32_t *,
1136  const ne10_int32_t fstride,
1137  const ne10_int32_t out_step,
1138  const ne10_int32_t nfft)
1139 {
1140  CPLX in[8];
1141  CPLX out[8];
1142 
1143  const ne10_int32_t in_step = nfft / 8;
1144  ne10_int32_t f_count;
1145  ne10_int32_t m_count;
1146 
1147  const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
1148 
1149  for (f_count = fstride; f_count > 0; f_count--)
1150  {
1151  for (m_count = out_step; m_count > 0; m_count--)
1152  {
1153  in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
1154  in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
1155  in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
1156  in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
1157  in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);
1158  in[5] = NE10_CPLX_LOAD (Fin + 5 * in_step);
1159  in[6] = NE10_CPLX_LOAD (Fin + 6 * in_step);
1160  in[7] = NE10_CPLX_LOAD (Fin + 7 * in_step);
1161 
1162  if (is_inverse == 1)
1163  {
1164  NE10_FFT8_CONJ (in);
1165  }
1166 
1167  NE10_FFT8_FUC_NEON_F32 (out, in);
1168 
1169  if (is_inverse == 1)
1170  {
1171  NE10_FFT8_CONJ (out);
1172  }
1173  if (is_scaled)
1174  {
1175  NE10_FFT8_SCALING (out, one_by_fft_neon);
1176  }
1177 
1178  NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
1179  NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
1180  NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
1181  NE10_CPLX_STORE (Fout + 3 * out_step, out[3]);
1182  NE10_CPLX_STORE (Fout + 4 * out_step, out[4]);
1183  NE10_CPLX_STORE (Fout + 5 * out_step, out[5]);
1184  NE10_CPLX_STORE (Fout + 6 * out_step, out[6]);
1185  NE10_CPLX_STORE (Fout + 7 * out_step, out[7]);
1186 
1187  Fin++;
1188  Fout += 8;
1189  }
1190  }
1191 }
1192 
1193 template<ne10_int32_t is_inverse, bool is_scaled>
1194 static void ne10_mixed_radix_generic_butterfly_float32_neon_impl (CPLX *Fout,
1195  const CPLX *Fin,
1196  const ne10_int32_t *factors,
1197  const ne10_fft_cpx_float32_t *twiddles,
1198  CPLX *buffer)
1199 {
1200  ne10_int32_t fstride, mstride, radix;
1201  ne10_int32_t stage_count;
1202  ne10_int32_t nfft;
1203 
1204  // init fstride, mstride, radix, nfft
1205  stage_count = factors[0];
1206  fstride = factors[1];
1207  mstride = 1;
1208  radix = factors[ stage_count << 1 ]; // radix of first stage
1209  nfft = fstride * radix;
1210 
1211  // Swap so that the final stage writes its output to Fout.
1212  if (stage_count % 2 == 0)
1213  {
1214  ne10_swap_ptr (buffer, Fout);
1215  }
1216 
1217  // first stage
1218  switch (radix)
1219  {
1220  case 2:
1221  ne10_radix_2_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
1222  fstride, 1, nfft);
1223  break;
1224  case 4:
1225  ne10_radix_4_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
1226  fstride, 1, nfft);
1227  break;
1228  case 3:
1229  ne10_radix_3_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
1230  fstride, 1, nfft);
1231  break;
1232  case 5:
1233  ne10_radix_5_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
1234  fstride, 1, nfft);
1235  break;
1236  case 8:
1237  ne10_radix_8_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
1238  fstride, 1, nfft);
1239  break;
1240  }
1241 
1242  stage_count--;
1243  if (! stage_count) // finish
1244  {
1245  return;
1246  }
1247 
1248  mstride *= radix;
1249 
1250  // update radix
1251  if (radix % 2)
1252  {
1253  twiddles += radix;
1254  }
1255  radix = factors[ stage_count << 1 ];
1256 
1257  // other stages
1258  while (stage_count > 0)
1259  {
1260 
1261  // radix of the current stage; should be one of {2, 3, 4, 5}
1262  assert ((radix > 1) && (radix < 6));
1263 
1264  ne10_swap_ptr (buffer, Fout);
1265 
1266  fstride /= radix;
1267  switch (radix)
1268  {
1269  case 2:
1270  ne10_radix_2_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
1271  twiddles, fstride, mstride, nfft);
1272  break;
1273  case 3:
1274  ne10_radix_3_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
1275  twiddles, fstride, mstride, nfft);
1276  break;
1277  case 4:
1278  ne10_radix_4_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
1279  twiddles, fstride, mstride, nfft);
1280  break;
1281  case 5:
1282  ne10_radix_5_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
1283  twiddles, fstride, mstride, nfft);
1284  break;
1285  } // switch (radix)
1286 
1287  twiddles += mstride * (radix - 1);
1288  mstride *= radix;
1289 
1290  stage_count--;
1291  radix = factors[ stage_count << 1 ];
1292  } // while (stage_count)
1293 }
1294 
1295 template<ne10_int32_t is_inverse>
1296 static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
1297  const CPLX *Fin,
1298  const ne10_fft_cpx_float32_t *twiddles,
1299  const ne10_int32_t fstride,
1300  const ne10_int32_t out_step,
1301  const ne10_int32_t)
1302 {
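  // The last stage completes the full-length transform: each group of four
  // CPLX vectors is transposed (a 4x4 complex transpose), multiplied by the
  // remaining twiddles, and combined with a radix-4 butterfly, which stitches
  // the four length-(nfft) sub-transforms computed by the earlier stages into
  // the final length-(4 * nfft) result. The scalar tail after the main loop
  // handles the out_step % 4 leftover entries.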
1303  ne10_int32_t f_count;
1304  ne10_int32_t m_count;
1305 
1306  for (f_count = fstride; f_count > 0; f_count--)
1307  {
1308  CPLX scratch_in[4];
1309  CPLX scratch_out[4];
1310  CPLX scratch[4];
1311 
1312  for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
1313  {
1314 #ifndef NE10_INLINE_ASM_OPT
1315  scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
1316  scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
1317  scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
1318  scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);
1319 
1320  // Transpose
1321  {
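  // NE10_RADIX4X4C_TRANSPOSE_NEON presumably pastes the suffixes 0..3 onto
  // its two macro arguments, so it reads the scalar locals scratch_in0..3
  // and writes scratch0..3 declared below; that is why the plain locals,
  // rather than the arrays, are copied in and out around the call.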
1322  CPLX scratch0, scratch_in0;
1323  CPLX scratch1, scratch_in1;
1324  CPLX scratch2, scratch_in2;
1325  CPLX scratch3, scratch_in3;
1326 
1327  scratch_in0 = scratch_in[0];
1328  scratch_in1 = scratch_in[1];
1329  scratch_in2 = scratch_in[2];
1330  scratch_in3 = scratch_in[3];
1331 
1332  NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);
1333 
1334  scratch_in[0] = scratch0;
1335  scratch_in[1] = scratch1;
1336  scratch_in[2] = scratch2;
1337  scratch_in[3] = scratch3;
1338  }
1339 #else // NE10_INLINE_ASM_OPT
1340 #ifndef __aarch64__
1341 #error Currently, inline assembly optimizations are only available on AArch64.
1342 #else // __aarch64__
1343  const float *pin = (const float *) Fin;
1344  asm volatile (
1345  "ld2 {v0.4s, v1.4s}, [%[pin]], %[offset] \n\t"
1346  "ld2 {v2.4s, v3.4s}, [%[pin]], %[offset] \n\t"
1347  "ld2 {v4.4s, v5.4s}, [%[pin]], %[offset] \n\t"
1348  "ld2 {v6.4s, v7.4s}, [%[pin]] \n\t"
1349 
1350  // NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
1351  "trn1 v8.4s, v0.4s, v2.4s \n\t"
1352  "trn2 v9.4s, v0.4s, v2.4s \n\t"
1353  "trn1 v10.4s, v4.4s, v6.4s \n\t"
1354  "trn2 v11.4s, v4.4s, v6.4s \n\t"
1355 
1356  "trn1 %[in0r].2d, v8.2d, v10.2d \n\t"
1357  "trn1 %[in1r].2d, v9.2d, v11.2d \n\t"
1358  "trn2 %[in2r].2d, v8.2d, v10.2d \n\t"
1359  "trn2 %[in3r].2d, v9.2d, v11.2d \n\t"
1360 
1361  "trn1 v8.4s, v1.4s, v3.4s \n\t"
1362  "trn2 v9.4s, v1.4s, v3.4s \n\t"
1363  "trn1 v10.4s, v5.4s, v7.4s \n\t"
1364  "trn2 v11.4s, v5.4s, v7.4s \n\t"
1365 
1366  "trn1 %[in0i].2d, v8.2d, v10.2d \n\t"
1367  "trn1 %[in1i].2d, v9.2d, v11.2d \n\t"
1368  "trn2 %[in2i].2d, v8.2d, v10.2d \n\t"
1369  "trn2 %[in3i].2d, v9.2d, v11.2d \n\t"
1370  : [in0r]"+w"(scratch_in[0].val[0]),
1371  [in0i]"+w"(scratch_in[0].val[1]),
1372  [in1r]"+w"(scratch_in[1].val[0]),
1373  [in1i]"+w"(scratch_in[1].val[1]),
1374  [in2r]"+w"(scratch_in[2].val[0]),
1375  [in2i]"+w"(scratch_in[2].val[1]),
1376  [in3r]"+w"(scratch_in[3].val[0]),
1377  [in3i]"+w"(scratch_in[3].val[1]),
1378  [pin]"+r"(pin)
1379  : [offset]"r"(32)
1380  : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1381  "v8", "v9", "v10", "v11");
1382 #endif // __aarch64__
1383 #endif // NE10_INLINE_ASM_OPT
1384 
1385  if (is_inverse)
1386  {
1387  NE10_FFT4_CONJ (scratch_in);
1388  }
1389 
1390  // Not first stage
1391  {
1392 #ifndef NE10_INLINE_ASM_OPT
1393  CPLX scratch_tw[3];
1394 
1395  scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
1396  scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
1397  scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);
1398 
1399  NE10_FFT4_MUL_TW_NEON (scratch_in, scratch_in, scratch_tw);
1400 #else // NE10_INLINE_ASM_OPT
1401 #ifndef __aarch64__
1402 #error Currently, inline assembly optimizations are only available on AArch64.
1403 #else // __aarch64__
1404  const float *tw = (const float *)twiddles;
1405  asm volatile (
1406  "ld2 {v0.4s, v1.4s}, [%[tw]], %[offset] \n\t"
1407  "ld2 {v2.4s, v3.4s}, [%[tw]], %[offset] \n\t"
1408  "ld2 {v4.4s, v5.4s}, [%[tw]] \n\t"
1409 
1410  "fmul v6.4s, %[in1r].4s, v1.4s \n\t" // RI
1411  "fmul %[in1r].4s, %[in1r].4s, v0.4s \n\t" // RR
1412  "fmls %[in1r].4s, %[in1i].4s, v1.4s \n\t" // RR - II
1413  "fmul v7.4s, %[in1i].4s, v0.4s \n\t" // IR
1414  "fadd %[in1i].4s, v6.4s, v7.4s \n\t" // RI + IR
1415 
1416  "fmul v6.4s, %[in2r].4s, v3.4s \n\t" // RI
1417  "fmul %[in2r].4s, %[in2r].4s, v2.4s \n\t" // RR
1418  "fmls %[in2r].4s, %[in2i].4s, v3.4s \n\t" // RR - II
1419  "fmul v7.4s, %[in2i].4s, v2.4s \n\t" // IR
1420  "fadd %[in2i].4s, v6.4s, v7.4s \n\t" // RI + IR
1421 
1422  "fmul v6.4s, %[in3r].4s, v5.4s \n\t" // RI
1423  "fmul %[in3r].4s, %[in3r].4s, v4.4s \n\t" // RR
1424  "fmls %[in3r].4s, %[in3i].4s, v5.4s \n\t" // RR - II
1425  "fmul v7.4s, %[in3i].4s, v4.4s \n\t" // IR
1426  "fadd %[in3i].4s, v6.4s, v7.4s \n\t" // RI + IR
1427  : [tw]"+r"(tw),
1428  [in1r]"+w"(scratch_in[1].val[0]),
1429  [in1i]"+w"(scratch_in[1].val[1]),
1430  [in2r]"+w"(scratch_in[2].val[0]),
1431  [in2i]"+w"(scratch_in[2].val[1]),
1432  [in3r]"+w"(scratch_in[3].val[0]),
1433  [in3i]"+w"(scratch_in[3].val[1])
1434  : [offset]"r"(out_step * 8)
1435  : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
1436  "v6", "v7");
1437 #endif // __aarch64__
1438 #endif // NE10_INLINE_ASM_OPT
1439 
1440  }
1441 
1442  NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
1443  NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
1444  NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
1445  NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);
1446 
1447 #ifndef NE10_INLINE_ASM_OPT
1448  NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);
1449  NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
1450 
1451  scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
1452  scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
1453  scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
1454  scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
1455 
1456  if (is_inverse == 1)
1457  {
1458  NE10_FFT4_CONJ (scratch_out);
1459  }
1460 
1461  // Store.
1462  {
1463  ne10_fft_cpx_float32_t *Fout_cpx;
1464  Fout_cpx = (ne10_fft_cpx_float32_t *) Fout;
1465 
1466  NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
1467  NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
1468  NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
1469  NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
1470  }
1471 #else // NE10_INLINE_ASM_OPT
1472 #ifndef __aarch64__
1473 #error Currently, inline assembly optimizations are only available on AArch64.
1474 #else // __aarch64__
1475  asm volatile (
1476  "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
1477  "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
1478  "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
1479  "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"
1480  "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
1481  "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
1482  "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
1483  "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
1484  :
1485  : [s0r]"w"(scratch[0].val[0]),
1486  [s0i]"w"(scratch[0].val[1]),
1487  [s1r]"w"(scratch[1].val[0]),
1488  [s1i]"w"(scratch[1].val[1]),
1489  [s2r]"w"(scratch[2].val[0]),
1490  [s2i]"w"(scratch[2].val[1]),
1491  [s3r]"w"(scratch[3].val[0]),
1492  [s3i]"w"(scratch[3].val[1])
1493  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1494 
1495  if (is_inverse == 1)
1496  {
1497  asm volatile (
1498  "fneg v1.4s, v1.4s \n\t"
1499  "fneg v3.4s, v3.4s \n\t"
1500  "fneg v5.4s, v5.4s \n\t"
1501  "fneg v7.4s, v7.4s \n\t"
1502  :
1503  :
1504  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1505  }
1506 
1507  float *pout = (float *) Fout;
1508  asm volatile (
1509  "st2 {v0.4s, v1.4s}, [%[pout]], %[offset] \n\t"
1510  "st2 {v2.4s, v3.4s}, [%[pout]], %[offset] \n\t"
1511  "st2 {v4.4s, v5.4s}, [%[pout]], %[offset] \n\t"
1512  "st2 {v6.4s, v7.4s}, [%[pout]] \n\t"
1513  : [pout]"+r"(pout)
1514  : [offset]"r"(out_step * 8)
1515  : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1516 #endif // __aarch64__
1517 #endif // NE10_INLINE_ASM_OPT
1518  Fin += 4;
1519  Fout += 1;
1520  twiddles += 4;
1521  }
1522  }
1523 
1524  ne10_int32_t left_over = out_step % 4;
1525  if (left_over == 0)
1526  {
1527  return;
1528  }
1529 
1530  // Left over.
1531  const ne10_fft_cpx_float32_t *Fin_s = (ne10_fft_cpx_float32_t *) Fin;
1532  ne10_fft_cpx_float32_t *Fout_s = (ne10_fft_cpx_float32_t *) Fout;
1533  for (m_count = out_step % 4; m_count > 0; m_count--)
1534  {
1535  ne10_fft_cpx_float32_t scratch_in[4];
1536  ne10_fft_cpx_float32_t scratch_tw[4];
1537 
1538  scratch_in[0] = Fin_s[0];
1539  scratch_in[1] = Fin_s[1];
1540  scratch_in[2] = Fin_s[2];
1541  scratch_in[3] = Fin_s[3];
1542 
1543  if (is_inverse)
1544  {
1545  scratch_in[0].i = -scratch_in[0].i;
1546  scratch_in[1].i = -scratch_in[1].i;
1547  scratch_in[2].i = -scratch_in[2].i;
1548  scratch_in[3].i = -scratch_in[3].i;
1549  }
1550 
1551  scratch_tw[0] = twiddles[0 * out_step];
1552  scratch_tw[1] = twiddles[1 * out_step];
1553  scratch_tw[2] = twiddles[2 * out_step];
1554 
1555  FFT4_MUL_TW (scratch_in, scratch_in, scratch_tw);
1556 
1557  FFT4_FCU_INPLACE (scratch_in);
1558 
1559  if (is_inverse)
1560  {
1561  scratch_in[0].i = -scratch_in[0].i;
1562  scratch_in[1].i = -scratch_in[1].i;
1563  scratch_in[2].i = -scratch_in[2].i;
1564  scratch_in[3].i = -scratch_in[3].i;
1565  }
1566 
1567  Fout_s[0 * out_step] = scratch_in[0];
1568  Fout_s[1 * out_step] = scratch_in[1];
1569  Fout_s[2 * out_step] = scratch_in[2];
1570  Fout_s[3 * out_step] = scratch_in[3];
1571 
1572  Fin_s += 4;
1573  Fout_s += 1;
1574  twiddles += 1;
1575  }
1576 }
1577 
1578 typedef void (*NE10_MIXED_RADIX_FUNC) (CPLX*, const CPLX *, const ne10_int32_t *,
1579  const ne10_fft_cpx_float32_t *, CPLX *);
1580 
1581 void ne10_mixed_radix_generic_butterfly_float32_neon (
1582  ne10_fft_cpx_float32_t *Fout,
1583  const ne10_fft_cpx_float32_t *Fin,
1584  const ne10_int32_t *factors,
1585  const ne10_fft_cpx_float32_t *twiddles,
1586  ne10_fft_cpx_float32_t *buffer,
1587  const ne10_int32_t is_scaled)
1588 {
1589  ne10_int32_t stage_count = factors[0];
1590  ne10_int32_t fstride = factors[1];
1591  ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage
1592 
1593  NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;
1594 
1595  // nfft below is not the actual length of the FFT; it is 1/4 of the
1596  // actual length.
1597  ne10_int32_t nfft = fstride * radix;
1598 
1599  if (is_scaled)
1600  {
1601  ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, true>;
1602  }
1603  else
1604  {
1605  ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, false>;
1606  }
1607 
1608  ne10_mixed_radix_impl ((CPLX *) buffer,
1609  (const CPLX *) Fin, // From Fin to buffer
1610  factors,
1611  twiddles,
1612  (CPLX *) Fout); // Fout is "buffer" for these stages.
1613 
1614  ne10_c2c_1d_last_stage_neon<0> ((CPLX *) Fout,
1615  (const CPLX *) buffer, // From buffer to Fout
1616  twiddles + nfft,
1617  1, // out_step == fstride == 1
1618  nfft, // in_step == mstride == nfft
1619  nfft * 4); // Actual length of FFT
1620 }
1621 
1622 void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (
1623  ne10_fft_cpx_float32_t *Fout,
1624  const ne10_fft_cpx_float32_t *Fin,
1625  const ne10_int32_t *factors,
1626  const ne10_fft_cpx_float32_t *twiddles,
1627  ne10_fft_cpx_float32_t *buffer,
1628  const ne10_int32_t is_scaled)
1629 {
1630  ne10_int32_t stage_count = factors[0];
1631  ne10_int32_t fstride = factors[1];
1632  ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage
1633 
1634  NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;
1635 
1636  // nfft below is not the actual length of the FFT; it is 1/4 of the
1637  // actual length.
1638  ne10_int32_t nfft = fstride * radix;
1639 
1640  if (is_scaled)
1641  {
1642  ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, true>;
1643  }
1644  else
1645  {
1646  ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, false>;
1647  }
1648 
1649  ne10_mixed_radix_impl ((CPLX *) buffer,
1650  (const CPLX *) Fin, // From Fin to buffer
1651  factors,
1652  twiddles,
1653  (CPLX *) Fout); // Fout is "buffer" for these stages.
1654 
1655  ne10_c2c_1d_last_stage_neon<1> ((CPLX *) Fout,
1656  (const CPLX *) buffer, // From buffer to Fout
1657  twiddles + nfft,
1658  1, // out_step == fstride == 1
1659  nfft, // in_step == mstride == nfft
1660  nfft * 4); // Actual length of FFT
1661 }