Project Ne10
An open, optimized software library for the ARM architecture.
NE10_rfft_float32.neonintrinsic.c
/*
 *  Copyright 2014-16 ARM Limited and Contributors.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of ARM Limited nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* license of Kiss FFT */
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * NE10 Library : dsp/NE10_rfft_float32.neonintrinsic.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"
#include "NE10_fft.neonintrinsic.h"

NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8(float32x4_t,q_in);
    NE10_DECLARE_8(float32x4_t,q_out);

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;   // 8 x fstride
    float32x4_t *Fout_neon = (float32x4_t*) Fout;       // fstride x 8

    for (f_count = fstride; f_count > 0; f_count --)
    {
        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = fstride
        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);

        // print q_in0 ~ q_in7
        // NE10_PRINT_Qx8_VECTOR(q_in);

        // do r2c fft, size = 8
        NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);

        // print q_out0 ~ q_out7
        // NE10_PRINT_Qx8_VECTOR(q_out);

        // store q_out0 ~ q_out7 to Fout_neon, by step = 1
        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);

        Fin_neon = Fin_neon - fstride * 8 + 1;
        Fout_neon += 8; // next column
    }
}
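
/*
 * Note (editor's addition): each float32x4_t lane carries one of four
 * independent sub-sequences, so one pass of the loop above computes four
 * size-8 real-to-complex FFTs at once. The input block is read column-wise
 * (load step = fstride) and written back row-wise (store step = 1),
 * effectively transposing the 8 x fstride input into an fstride x 8 output
 * (see the Fin_neon/Fout_neon comments above).
 */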

NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8(float32x4_t,q_in);
    NE10_DECLARE_8(float32x4_t,q_out);

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count > 0; f_count --)
    {
        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = 1
        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);

        // NE10_PRINT_Qx8_VECTOR(q_in);

        NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);

        // NE10_PRINT_Qx8_VECTOR(q_out);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
        q_out4 = vmulq_f32(q_out4,one_by_N_neon);
        q_out5 = vmulq_f32(q_out5,one_by_N_neon);
        q_out6 = vmulq_f32(q_out6,one_by_N_neon);
        q_out7 = vmulq_f32(q_out7,one_by_N_neon);
#endif

        // store
        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);

        Fout_neon ++;
    }
}
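
/*
 * Scaling note (editor's addition): with NE10_DSP_RFFT_SCALING defined, the
 * inverse stages multiply by one_by_N = 0.25 / nfft. The nfft seen here is
 * the stage product radix * fstride rather than the true transform length
 * (the butterfly drivers below pass it down with a "not the real nfft"
 * caveat), so the extra 1/4 factor presumably completes the overall 1/N
 * normalisation of the c2r path.
 */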

NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    for (f_count = 0; f_count < fstride; f_count ++)
    {
        NE10_DECLARE_4(float32x4_t,q_in);
        NE10_DECLARE_4(float32x4_t,q_out);

        // load
        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);

        NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);

        // store
        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);

        Fin_neon = Fin_neon - 4*fstride + 1;
        Fout_neon += 4;
    }
}

NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

    for (f_count = 0; f_count < fstride; f_count ++)
    {
        NE10_DECLARE_4(float32x4_t,q_in);
        NE10_DECLARE_4(float32x4_t,q_out);

        // load
        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);

        // NE10_PRINT_Qx4_VECTOR(q_in);

        NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);

        // NE10_PRINT_Qx4_VECTOR(q_out);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
#endif

        // store
        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);

        Fout_neon ++;
    }
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);

    NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);

    // store
    vst1q_f32( (ne10_float32_t*) (Fout_neon                          ), q_out0);
    vst1q_f32( (ne10_float32_t*) (Fout_neon +     (out_step << 1) - 1), q_out1);
    vst1q_f32( (ne10_float32_t*) (Fout_neon +     (out_step << 1)    ), q_out2);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
}

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon                          ) );
    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon +     (out_step << 1) - 1) );
    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon +     (out_step << 1)    ) );
    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );

    // NE10_PRINT_Qx4_VECTOR(q_in);

    NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);

    // NE10_PRINT_Qx4_VECTOR(q_out);

    // store
    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step>>1) -1;
    float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2; // reversed

    NE10_DECLARE_3(float32x4x2_t,q2_tw);
    NE10_DECLARE_4(float32x4x2_t,q2_in);
    NE10_DECLARE_4(float32x4x2_t,q2_out);

    for (m_count = loop_count; m_count > 0; m_count -- )
    {
#ifndef NE10_INLINE_ASM_OPT
        // load
        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step    ) );
        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );

        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step    ) );
        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );

        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step    ) );
        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );

        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step    ) );
        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );

        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

        // R2C TW KERNEL
        NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
        const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
        asm volatile (
            "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
            "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
            "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"

            "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
            "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
            "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t" // RR - II
            "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t" // RI + IR

            "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t" // RR
            "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t" // RI
            "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t" // RR - II
            "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t" // RI + IR

            "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t" // RR
            "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t" // RI
            "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t" // RR - II
            "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t" // RI + IR
            : [q2_out0r]"+w"(q2_out0.val[0]),
              [q2_out0i]"+w"(q2_out0.val[1]),
              [q2_out1r]"+w"(q2_out1.val[0]),
              [q2_out1i]"+w"(q2_out1.val[1]),
              [q2_out2r]"+w"(q2_out2.val[0]),
              [q2_out2i]"+w"(q2_out2.val[1]),
              [q2_out3r]"+w"(q2_out3.val[0]),
              [q2_out3i]"+w"(q2_out3.val[1]),
              [ptr_inr]"+r"(ptr_inr),
              [ptr_ini]"+r"(ptr_ini)
            : [offset_in]"r"(in_step * 16),
              [ptr_tw]"r"(twiddles)
            : "memory", "v0", "v1", "v2",
              "v10", "v11", "v12", "v13", "v14", "v15"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);

        // store
        vst1q_f32( (ne10_float32_t*) ( Fout_neon    ), q2_out0.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_neon + 1), q2_out0.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1)    ), q2_out1.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_b    ), q2_out2.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_b + 1), q2_out2.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1)    ), q2_out3.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );

        // update pointers
        Fin_neon  += 2;
        Fout_neon += 2;
        Fout_b    -= 2;
        twiddles  += 3;
    }
}
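
/*
 * Illustrative helper (editor's addition, not used by the library): the
 * fmul/fmls/fmla sequence above is the lane-wise expansion of a complex
 * multiply, (a + bi)(c + di) = (ac - bd) + (ad + bc)i. The same operation
 * written with intrinsics:
 */
static inline float32x4x2_t ne10_cpx_mul_sketch (float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t dst;
    dst.val[0] = vmulq_f32 (a.val[0], b.val[0]);             // RR
    dst.val[1] = vmulq_f32 (a.val[0], b.val[1]);             // RI
    dst.val[0] = vmlsq_f32 (dst.val[0], a.val[1], b.val[1]); // RR - II
    dst.val[1] = vmlaq_f32 (dst.val[1], a.val[1], b.val[0]); // RI + IR
    return dst;
}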

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step>>1) -1;
    const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2; // reversed

    NE10_DECLARE_3(float32x4x2_t,q2_tw);
    NE10_DECLARE_4(float32x4x2_t,q2_in);
    NE10_DECLARE_4(float32x4x2_t,q2_out);

    for (m_count = loop_count; m_count > 0; m_count -- )
    {
        // load
        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon    ) );
        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + 1) );

        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1)    ) );
        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );

        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b    ) );
        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b + 1) );

        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1)    ) );
        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );

        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

        // NE10_PRINT_Q2x4_VECTOR(q2_in);

        // C2R TW KERNEL
        NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);

        // NE10_PRINT_Q2x4_VECTOR(q2_out);

        // store
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step    ), q2_out0.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step    ), q2_out1.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step    ), q2_out2.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step    ), q2_out3.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );

        // update pointers
        Fin_neon  += 2;
        Fout_neon += 2;
        Fin_b     -= 2;
        twiddles  += 3;
    }
}
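
/*
 * Note (editor's addition): the inverse kernels in effect multiply by
 * conjugated twiddles, using
 *     (a + bi)(c - di) = (ac + bd) + (bc - ad)i,
 * which undoes the forward rotation without requiring a separate backward
 * twiddle table; the NE10_CPX_CONJ_MUL_F32 and NE10_CPX_MUL_INV_NEON_F32
 * macros used elsewhere in this file express the same identity.
 */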

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);

    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(q_out,q_in);

    // store
    vst1q_f32( (ne10_float32_t*) (Fout_neon                       ), q_out0);
    vst1q_f32( (ne10_float32_t*) (Fout_neon                    + 1), q_out1);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q_out2);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q_out3);
}

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon                       ) );
    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon                    + 1) );
    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)    ) );
    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) + 1) );

    // NE10_PRINT_Qx4_VECTOR(q_in);

    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(q_out,q_in);

    // NE10_PRINT_Qx4_VECTOR(q_out);

    // store
    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;
    const ne10_fft_cpx_float32_t *tw;

    for (f_count = fstride; f_count; f_count --)
    {
        tw = twiddles + 3;

        // first butterfly
        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fin_neon ++;
        Fout_neon ++;

        // other butterfly
        // Twiddle tables are transposed to avoid memory access by a large stride.
        ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_r, Fout_r, twiddles
        Fin_neon  += 2 * ( (out_step >> 1) - 1);
        Fout_neon += 2 * ( (out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
        Fin_neon ++;
        Fout_neon ++;

        Fout_neon = Fout_neon + 3 * out_step;
    } // f_count
}
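
/*
 * Note (editor's addition): the per-column work is split three ways. The
 * first column's twiddle is the trivial (1, 0) and the midpoint column has a
 * fixed form, so both are handled by dedicated kernels and receive NULL
 * instead of a table pointer; only the remaining columns consume the
 * (transposed) twiddle table, starting at twiddles + 3.
 */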

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;
    const ne10_fft_cpx_float32_t *tw;

    for (f_count = fstride; f_count; f_count --)
    {
        tw = twiddles + 3;

        // first butterfly
        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fin_neon ++;
        Fout_neon ++;

        // other butterfly
        // Twiddle tables are transposed to avoid memory access by a large stride.
        ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_r, Fout_r, twiddles
        Fin_neon  += 2 * ( (out_step >> 1) - 1);
        Fout_neon += 2 * ( (out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
        Fin_neon ++;
        Fout_neon ++;

        Fin_neon = Fin_neon + 3 * out_step;
    } // f_count
}

NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
        const ne10_fft_cpx_float32_t * Fin,
        const ne10_int32_t * factors,
        const ne10_fft_cpx_float32_t * twiddles,
        ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // PRINT_STAGE_INFO;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride     = factors[1];
    mstride     = factors[ (stage_count << 1) - 1 ];
    radix       = factors[  stage_count << 1 ];
    nfft        = radix * fstride; // not the real nfft

    // PRINT_STAGE_INFO;

    if (stage_count % 2 == 1) // since there is another stage outside
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // the first stage
    if (radix == 8) // length of FFT is 2^n (n is odd)
    {
        ne10_radix8x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        ne10_radix4x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
    }
    // end of first stage

    // others
    for (; fstride > 1;)
    {
        fstride >>= 2;
        ne10_swap_ptr (buffer, Fout);

        ne10_radix4x4_r2c_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
        twiddles += 3 * mstride;
        mstride <<= 2;
    } // other stage
}
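
/*
 * Stage structure (editor's addition): the first stage is radix-8 when log2
 * of the length is odd and radix-4 when it is even; all remaining stages are
 * radix-4 with twiddles. Fout and buffer are ping-ponged via ne10_swap_ptr,
 * and the extra swap for an odd stage_count ensures that the final stage --
 * run by the caller, outside this function -- deposits its result in the
 * caller's Fout.
 */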

NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
        const ne10_fft_cpx_float32_t * Fin,
        const ne10_int32_t * factors,
        const ne10_fft_cpx_float32_t * twiddles,
        ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // PRINT_STAGE_INFO;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride     = factors[1];

    mstride     = factors[ (stage_count << 1) - 1 ];
    radix       = factors[  stage_count << 1 ];
    nfft        = radix * fstride; // not the real nfft

    // fstride, mstride for the last stage
    fstride = 1;
    mstride = nfft >> 2;

    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr(Fout,buffer);
    }

    // all stages but the first
    for (; stage_count > 1;)
    {
        twiddles -= 3 * mstride;

        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix4x4_c2r_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);

        fstride <<= 2;
        mstride >>= 2;
        stage_count --;
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage -- inversed
    if (radix == 8) // length of FFT is 2^n (n is odd)
    {
        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix8x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix4x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
    }
}
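
/*
 * Note (editor's addition): the c2r path runs the forward stages in reverse.
 * twiddles is expected to point just past the forward table (the backward
 * pointer prepared in the configuration), so each stage first rewinds it by
 * 3 * mstride before use, and the forward transform's first stage (radix-8
 * or radix-4) is undone last.
 */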

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // b0
    {
        ne10_float32_t q_4r_out[4];
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;

        NE10_FFT_R2C_4R_RCR(q_4r_out,p_src_r);

        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[3];
        dst += (nfft>>2);
        dst[0].r = q_4r_out[1];
        dst[0].i = q_4r_out[2];
        dst -= (nfft>>2);
    }

    // b2
    {
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) (src);
        p_src_r += nfft;
        p_src_r -= 4;

        ne10_float32_t q_4r_out[4];

        NE10_FFT_R2C_4R_CC(q_4r_out,p_src_r);

        dst += (nfft>>3);
        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[1];
        dst += (nfft>>2);
        dst[0].r = q_4r_out[2];
        dst[0].i = q_4r_out[3];
        dst -= (nfft>>3);
        dst -= (nfft>>2);
    }

    // b1
    ne10_fft_cpx_float32_t cc_out[4];
    ne10_fft_cpx_float32_t cc_in [4];
    const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
    p_src_r += 4;

    cc_out[0].r = *(p_src_r ++);
    cc_out[1].r = *(p_src_r ++);
    cc_out[2].r = *(p_src_r ++);
    cc_out[3].r = *(p_src_r ++);

    cc_out[0].i = *(p_src_r ++);
    cc_out[1].i = *(p_src_r ++);
    cc_out[2].i = *(p_src_r ++);
    cc_out[3].i = *(p_src_r ++);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    // twiddles[0] = ( 1.0, 0.0);
    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
    cc_in[0] = cc_out[0];
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[1],cc_out[1],twiddles[0]);
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[2],cc_out[2],twiddles[0]);
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[3],cc_out[3],twiddles[0]);

    // NE10_PRINT_Q2_VECTOR(cc_in);

    NE10_FFT_R2C_CC_CC(cc_out,cc_in);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    dst[1] = cc_out[0];
    dst += (nfft>>2);
    dst[ 1] = cc_out[1];
    dst[-1] = cc_out[3];
    dst += (nfft>>2);
    dst[-1] = cc_out[2];
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // b0
    {
        ne10_float32_t q_4r_in[4];
        ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;

        q_4r_in[0] = src[0].r;
        q_4r_in[3] = src[0].i;
        src += (nfft>>2);
        q_4r_in[1] = src[0].r;
        q_4r_in[2] = src[0].i;
        src -= (nfft>>2);

        NE10_FFT_C2R_RCR_4R(p_dst_r,q_4r_in);
    }

    // b2
    {
        // float32x4_t q_in;
        ne10_float32_t *p_dst_r = (ne10_float32_t*) (dst);
        p_dst_r += nfft;
        p_dst_r -= 4;

        ne10_float32_t q_4r_in[4];
        src += (nfft>>3);
        q_4r_in[0] = src[0].r;
        q_4r_in[1] = src[0].i;
        src += (nfft>>2);
        q_4r_in[2] = src[0].r;
        q_4r_in[3] = src[0].i;
        src -= (nfft>>3);
        src -= (nfft>>2);

        NE10_FFT_C2R_CC_4R(p_dst_r,q_4r_in);
    }

    // b1
    ne10_fft_cpx_float32_t cc_out[4];
    ne10_fft_cpx_float32_t cc_in [4];
    ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
    p_dst_r += 4;

    // load
    cc_out[0] = src[1];
    src += (nfft>>2);
    cc_out[2] = src[ 1];
    cc_out[3] = src[-1];
    src += (nfft>>2);
    cc_out[1] = src[-1];

    // NE10_PRINT_Q2_VECTOR(cc_out);

    NE10_FFT_C2R_CC_CC(cc_in,cc_out);

    // NE10_PRINT_Q2_VECTOR(cc_in);

    // twiddles[0] = ( 1.0, 0.0);
    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
    cc_out[0] = cc_in[0];
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[1],cc_in[1],twiddles[0]);
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[2],cc_in[2],twiddles[0]);
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[3],cc_in[3],twiddles[0]);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    *(p_dst_r ++) = cc_out[0].r;
    *(p_dst_r ++) = cc_out[1].r;
    *(p_dst_r ++) = cc_out[2].r;
    *(p_dst_r ++) = cc_out[3].r;

    *(p_dst_r ++) = cc_out[0].i;
    *(p_dst_r ++) = cc_out[1].i;
    *(p_dst_r ++) = cc_out[2].i;
    *(p_dst_r ++) = cc_out[3].i;
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // assert ( nfft % 4 == 0 );
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 12;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    /*  INPUT & OUTPUT
     *  0R 1R 2R 3R     Q0
     *  0I 1I 2I 3I     Q1
     *  4R 5R 6R 7R     Q2
     *  4I 5I 6I 7I     Q3
     */

    q_in0[0] = *(fin_r++);
    q_in0[1] = *(fin_r++);
    q_in0[2] = *(fin_r++);
    q_in0[3] = *(fin_r++);
    q_in1[0] = *(fin_r++);
    q_in1[1] = *(fin_r++);
    q_in1[2] = *(fin_r++);
    q_in1[3] = *(fin_r++);
    q_in2[0] = *(fin_r++);
    q_in2[1] = *(fin_r++);
    q_in2[2] = *(fin_r++);
    q_in2[3] = *(fin_r++);
    q_in3[0] = *(fin_r++);
    q_in3[1] = *(fin_r++);
    q_in3[2] = *(fin_r++);
    q_in3[3] = *(fin_r++);

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0+8];
    q2_tw1[0][1] = tw[2+8];
    q2_tw1[0][2] = tw[4+8];
    q2_tw1[0][3] = tw[6+8];
    q2_tw1[1][0] = tw[1+8];
    q2_tw1[1][1] = tw[3+8];
    q2_tw1[1][2] = tw[5+8];
    q2_tw1[1][3] = tw[7+8];

    // TW: in->out
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    //----------------------------------------------------------//
    // first 2 lines
    // R R R I I
    q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
    // I R I I R
    q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];

    // R R R I I
    q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
    // I R I I R
    q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];

    // R R R I I
    q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
    // I R I I R
    q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];

    //---------------------------------------------------------//
    // second 2 lines
    // R R R I I
    q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
    // I R I I R
    q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];

    // R R R I I
    q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
    // I R I I R
    q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];

    // R R R I I
    q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
    // I R I I R
    q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // BUTTERFLY - radix 4x2
    // STAGE
    // q_out -> q_in
    // R i R j R k
    q_in0[0] = q_out0[0] + q_out0[2];
    q_in1[0] = q_out1[0] + q_out1[2];

    q_in0[1] = q_out0[0] - q_out0[2];
    q_in1[1] = q_out1[0] - q_out1[2];

    // R i R j R k
    q_in0[2] = q_out0[1] + q_out0[3];
    q_in1[2] = q_out1[1] + q_out1[3];

    q_in0[3] = q_out0[1] - q_out0[3];
    q_in1[3] = q_out1[1] - q_out1[3];

    // R i R j R k
    q_in2[0] = q_out2[0] + q_out2[2];
    q_in3[0] = q_out3[0] + q_out3[2];

    q_in2[1] = q_out2[0] - q_out2[2];
    q_in3[1] = q_out3[0] - q_out3[2];

    // R i R j R k
    q_in2[2] = q_out2[1] + q_out2[3];
    q_in3[2] = q_out3[1] + q_out3[3];

    q_in2[3] = q_out2[1] - q_out2[3];
    q_in3[3] = q_out3[1] - q_out3[3];

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    // STAGE
    // q_in -> q_out
    // and transpose
    // R i R j R k
    q_out0[0] =   q_in0[0] + q_in0[2];
    q_out0[1] =   q_in1[0] + q_in1[2];

    q_out2[2] =   q_in0[0] - q_in0[2];
    q_out2[3] = - q_in1[0] + q_in1[2]; // CONJ

    // R i R j R k
    q_out3[2] =   q_in0[1] - q_in1[3];
    q_out3[3] = - q_in1[1] - q_in0[3]; // CONJ

    q_out1[0] =   q_in0[1] + q_in1[3];
    q_out1[1] =   q_in1[1] - q_in0[3];

    // R i R j R k
    q_out0[2] =   q_in2[0] + q_in2[2];
    q_out0[3] =   q_in3[0] + q_in3[2];

    q_out2[0] =   q_in2[0] - q_in2[2];
    q_out2[1] = - q_in3[0] + q_in3[2]; // CONJ

    // R i R j R k
    q_out3[0] =   q_in2[1] - q_in3[3];
    q_out3[1] = - q_in3[1] - q_in2[3]; // CONJ

    q_out1[2] =   q_in2[1] + q_in3[3];
    q_out1[3] =   q_in3[1] - q_in2[3];

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // STORE
    fout_r += 4;
    fout_r[0] = q_out0[0];
    fout_r[1] = q_out0[1];
    fout_r[2] = q_out0[2];
    fout_r[3] = q_out0[3];

    fout_r += (nfft>>1);
    fout_r[0] = q_out1[0];
    fout_r[1] = q_out1[1];
    fout_r[2] = q_out1[2];
    fout_r[3] = q_out1[3];

    fout_r -= 10;
    fout_r[0] = q_out3[0];
    fout_r[1] = q_out3[1];
    fout_r[2] = q_out3[2];
    fout_r[3] = q_out3[3];

    fout_r += (nfft>>1);
    fout_r[0] = q_out2[0];
    fout_r[1] = q_out2[1];
    fout_r[2] = q_out2[2];
    fout_r[3] = q_out2[3];
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 12;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    /*  INPUT & OUTPUT
     *  0R 1R 2R 3R     Q0
     *  0I 1I 2I 3I     Q1
     *  4R 5R 6R 7R     Q2
     *  4I 5I 6I 7I     Q3
     */

    // load
    fin_r += 4;
    q_in0[0] = fin_r[0];
    q_in0[1] = fin_r[1];
    q_in0[2] = fin_r[2];
    q_in0[3] = fin_r[3];

    fin_r += (nfft>>1);
    q_in1[0] = fin_r[0];
    q_in1[1] = fin_r[1];
    q_in1[2] = fin_r[2];
    q_in1[3] = fin_r[3];

    fin_r -= 10;
    q_in3[0] = fin_r[0];
    q_in3[1] = fin_r[1];
    q_in3[2] = fin_r[2];
    q_in3[3] = fin_r[3];

    fin_r += (nfft>>1);
    q_in2[0] = fin_r[0];
    q_in2[1] = fin_r[1];
    q_in2[2] = fin_r[2];
    q_in2[3] = fin_r[3];

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    // OUTPUT
    // INPUT
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
    q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] ); \
    q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] ); \
} while(0);

    // STAGE
    // q_in -> q_out
    // and transpose
    NE10_INV_BUTTERFLY_TMP( 0,0, 0,2,
                            0,0, 2,2);

    NE10_INV_BUTTERFLY_TMP( 1,2, 1,0,
                            0,1, 2,3);

    NE10_INV_BUTTERFLY_TMP( 0,1, 1,3,
                            1,0, 3,2);

    q_in3[3] *= - 1.0f;
    NE10_INV_BUTTERFLY_TMP( 1,1, 0,3,
                            3,3, 1,1);

    NE10_INV_BUTTERFLY_TMP( 2,0, 2,2,
                            0,2, 2,0);

    NE10_INV_BUTTERFLY_TMP( 3,2, 3,0,
                            0,3, 2,1);

    NE10_INV_BUTTERFLY_TMP( 2,1, 3,3,
                            1,2, 3,0);

    q_in3[1] *= - 1.0f;
    NE10_INV_BUTTERFLY_TMP( 3,1, 2,3,
                            3,1, 1,3);
#undef NE10_INV_BUTTERFLY_TMP

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // BUTTERFLY - radix 4x2
    // STAGE
    // q_out -> q_in

    // OUTPUT
    // INPUT
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
    q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] ); \
    q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] ); \
} while(0);

    NE10_INV_BUTTERFLY_TMP(0,0, 0,2,
                           0,0, 0,1);

    NE10_INV_BUTTERFLY_TMP(1,0, 1,2,
                           1,0, 1,1);

    NE10_INV_BUTTERFLY_TMP(0,1, 0,3,
                           0,2, 0,3);

    NE10_INV_BUTTERFLY_TMP(1,1, 1,3,
                           1,2, 1,3);

    NE10_INV_BUTTERFLY_TMP(2,0, 2,2,
                           2,0, 2,1);

    NE10_INV_BUTTERFLY_TMP(3,0, 3,2,
                           3,0, 3,1);


    NE10_INV_BUTTERFLY_TMP(2,1, 2,3,
                           2,2, 2,3);

    NE10_INV_BUTTERFLY_TMP(3,1, 3,3,
                           3,2, 3,3);

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);
#undef NE10_INV_BUTTERFLY_TMP

    // load tw
    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0+8];
    q2_tw1[0][1] = tw[2+8];
    q2_tw1[0][2] = tw[4+8];
    q2_tw1[0][3] = tw[6+8];
    q2_tw1[1][0] = tw[1+8];
    q2_tw1[1][1] = tw[3+8];
    q2_tw1[1][2] = tw[5+8];
    q2_tw1[1][3] = tw[7+8];

    // TW: in->out
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    //----------------------------------------------------------//
    // first 2 lines
    // R R R I I
    q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
    // I R I I R
    q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];

    // R R R I I
    q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
    // I R I I R
    q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];

    // R R R I I
    q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
    // I R I I R
    q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];

    //----------------------------------------------------------//
    // second 2 lines
    // R R R I I
    q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
    // I R I I R
    q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];

    // R R R I I
    q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
    // I R I I R
    q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];

    // R R R I I
    q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
    // I R I I R
    q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];

    // STORE
    *(fout_r++) =   q_out0[0];
    *(fout_r++) =   q_out0[1];
    *(fout_r++) =   q_out0[2];
    *(fout_r++) =   q_out0[3];
    *(fout_r++) =   q_out1[0];
    *(fout_r++) = - q_out1[1];
    *(fout_r++) = - q_out1[2];
    *(fout_r++) = - q_out1[3];
    *(fout_r++) =   q_out2[0];
    *(fout_r++) =   q_out2[1];
    *(fout_r++) =   q_out2[2];
    *(fout_r++) =   q_out2[3];
    *(fout_r++) =   q_out3[0];
    *(fout_r++) = - q_out3[1];
    *(fout_r++) = - q_out3[2];
    *(fout_r++) = - q_out3[3];
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = ((const ne10_float32_t*) src) + 12 + 16;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 8;
    ne10_float32_t *fout_b = (ne10_float32_t*) dst - 14;
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;

    // Take 4 elements as a set.
    // The leading 8 sets are already transformed in the first and second butterflies.
    // This function transforms 8 sets in each loop.
    ne10_int32_t loop_count = ((nfft >> 2) - 8) >> 3;

    for (; loop_count > 0; loop_count--)
    {
        NE10_DECLARE_4 (float32x4x2_t, q2_in);  // 8Q
        NE10_DECLARE_3 (float32x4x2_t, q2_tw);  // 6Q
        NE10_DECLARE_4 (float32x4x2_t, q2_out); // 8Q

        /*  INPUT
         *  0R 1R 2R 3R     Q0
         *  0I 1I 2I 3I     Q1
         *  4R 5R 6R 7R     Q2
         *  4I 5I 6I 7I     Q3
         *  8R 9R aR bR     Q4
         *  8I 9I aI bI     Q5
         *  cR dR eR fR     Q6
         *  cI dI eI fI     Q7
         */

        // transpose
        // q2_out -> q2_in
        /*
         *  val[0]
         *  0R 4R 8R cR     Q0
         *  1R 5R 9R dR     Q2
         *  2R 6R aR eR     Q4
         *  3R 7R bR fR     Q6
         *
         *  val[1]
         *  0I 4I 8I cI     Q1
         *  1I 5I 9I dI     Q3
         *  2I 6I aI eI     Q5
         *  3I 7I bI fI     Q7
         */

#ifndef NE10_INLINE_ASM_OPT
        q2_out0.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out0.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out1.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out1.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out2.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out2.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out3.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out3.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;

        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in, q2_out);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "ld1 {v0.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[0]
            "ld1 {v4.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[1]
            "ld1 {v1.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[0]
            "ld1 {v5.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[1]
            "ld1 {v2.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[0]
            "ld1 {v6.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[1]
            "ld1 {v3.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[0]
            "ld1 {v7.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[1]
            // NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
            "trn1 v8.4s, v0.4s, v1.4s \n\t"
            "trn2 v9.4s, v0.4s, v1.4s \n\t"
            "trn1 v10.4s, v2.4s, v3.4s \n\t"
            "trn2 v11.4s, v2.4s, v3.4s \n\t"

            "trn1 %[q2_in0r].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1r].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2r].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3r].2d, v9.2d, v11.2d \n\t"

            "trn1 v8.4s, v4.4s, v5.4s \n\t"
            "trn2 v9.4s, v4.4s, v5.4s \n\t"
            "trn1 v10.4s, v6.4s, v7.4s \n\t"
            "trn2 v11.4s, v6.4s, v7.4s \n\t"

            "trn1 %[q2_in0i].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1i].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2i].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3i].2d, v9.2d, v11.2d \n\t"

            : [q2_in0r]"+w"(q2_in0.val[0]),
              [q2_in0i]"+w"(q2_in0.val[1]),
              [q2_in1r]"+w"(q2_in1.val[0]),
              [q2_in1i]"+w"(q2_in1.val[1]),
              [q2_in2r]"+w"(q2_in2.val[0]),
              [q2_in2i]"+w"(q2_in2.val[1]),
              [q2_in3r]"+w"(q2_in3.val[0]),
              [q2_in3i]"+w"(q2_in3.val[1]),
              [fin_r]"+r"(fin_r)
            :
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
              "v8", "v9", "v10", "v11"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

#ifndef NE10_INLINE_ASM_OPT
        // Load twiddles
        q2_tw0 = vld2q_f32 (tw);
        tw += 8;
        q2_tw1 = vld2q_f32 (tw);
        tw += 8;
        q2_tw2 = vld2q_f32 (tw);
        tw += 8;

        // tw
        // q2_in -> q2_out
        q2_out0 = q2_in0;
        NE10_CPX_MUL_NEON_F32 (q2_out1, q2_in1, q2_tw0);
        NE10_CPX_MUL_NEON_F32 (q2_out2, q2_in2, q2_tw1);
        NE10_CPX_MUL_NEON_F32 (q2_out3, q2_in3, q2_tw2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            // Load twiddles
            "ld2 {v0.4s, v1.4s}, [%[tw0]] \n\t" // q2_tw0
            "ld2 {v2.4s, v3.4s}, [%[tw1]] \n\t" // q2_tw1
            "ld2 {v4.4s, v5.4s}, [%[tw2]] \n\t" // q2_tw2
            // tw
            // q2_in -> q2_out
            // NE10_CPX_MUL_NEON_F32(q2_out1,q2_in1,q2_tw0);
            "fmul %[q2_out1r].4s, v0.4s, %[q2_in1r].4s \n\t" // RR
            "fmul %[q2_out1i].4s, v0.4s, %[q2_in1i].4s \n\t" // RI
            "fmls %[q2_out1r].4s, v1.4s, %[q2_in1i].4s \n\t" // RR - II
            "fmla %[q2_out1i].4s, v1.4s, %[q2_in1r].4s \n\t" // RI + IR
            // NE10_CPX_MUL_NEON_F32(q2_out2,q2_in2,q2_tw1);
            "fmul %[q2_out2r].4s, v2.4s, %[q2_in2r].4s \n\t" // RR
            "fmul %[q2_out2i].4s, v2.4s, %[q2_in2i].4s \n\t" // RI
            "fmls %[q2_out2r].4s, v3.4s, %[q2_in2i].4s \n\t" // RR - II
            "fmla %[q2_out2i].4s, v3.4s, %[q2_in2r].4s \n\t" // RI + IR
            // NE10_CPX_MUL_NEON_F32(q2_out3,q2_in3,q2_tw2);
            "fmul %[q2_out3r].4s, v4.4s, %[q2_in3r].4s \n\t" // RR
            "fmul %[q2_out3i].4s, v4.4s, %[q2_in3i].4s \n\t" // RI
            "fmls %[q2_out3r].4s, v5.4s, %[q2_in3i].4s \n\t" // RR - II
            "fmla %[q2_out3i].4s, v5.4s, %[q2_in3r].4s \n\t" // RI + IR
            : [q2_out1r]"+w"(q2_out1.val[0]),
              [q2_out1i]"+w"(q2_out1.val[1]),
              [q2_out2r]"+w"(q2_out2.val[0]),
              [q2_out2i]"+w"(q2_out2.val[1]),
              [q2_out3r]"+w"(q2_out3.val[0]),
              [q2_out3i]"+w"(q2_out3.val[1])
            : [tw0]"r"(tw),
              [tw1]"r"(tw + 8),
              [tw2]"r"(tw + 16),
              [q2_in1r]"w"(q2_in1.val[0]),
              [q2_in1i]"w"(q2_in1.val[1]),
              [q2_in2r]"w"(q2_in2.val[0]),
              [q2_in2i]"w"(q2_in2.val[1]),
              [q2_in3r]"w"(q2_in3.val[0]),
              [q2_in3i]"w"(q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5"
        );
        q2_out0 = q2_in0;
        tw += 24;
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        // butterfly
        // out -> in
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
        q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);

        // in -> out
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);

        q2_out3.val[1] = vnegq_f32 (q2_out3.val[1]);
        q2_out2.val[1] = vnegq_f32 (q2_out2.val[1]);

#ifndef NE10_INLINE_ASM_OPT
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);

        // reverse -- CONJ
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[1]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[1]);

        // store
        vst2q_f32 (fout_r,               q2_out0);
        vst2q_f32 (fout_r + (nfft >> 1), q2_out1);
        vst2q_f32 (fout_b + (nfft >> 1), q2_out3);
        vst2q_f32 (fout_b + nfft,        q2_out2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "fadd v0.4s, %[q2_in0r].4s, %[q2_in2r].4s \n\t"
            "fadd v1.4s, %[q2_in0i].4s, %[q2_in2i].4s \n\t"
            "fadd v2.4s, %[q2_in1r].4s, %[q2_in3i].4s \n\t"
            "fsub v3.4s, %[q2_in1i].4s, %[q2_in3r].4s \n\t"
            // reverse -- CONJ
            "rev64 %[q2_in2r].4s, %[q2_out2r].4s \n\t"
            "rev64 %[q2_in2i].4s, %[q2_out2i].4s \n\t"
            "rev64 %[q2_in3r].4s, %[q2_out3r].4s \n\t"
            "rev64 %[q2_in3i].4s, %[q2_out3i].4s \n\t"
            "ext v4.16b, %[q2_in2r].16b, %[q2_in2r].16b, #8 \n\t"
            "ext v5.16b, %[q2_in2i].16b, %[q2_in2i].16b, #8 \n\t"
            "ext v6.16b, %[q2_in3r].16b, %[q2_in3r].16b, #8 \n\t"
            "ext v7.16b, %[q2_in3i].16b, %[q2_in3i].16b, #8 \n\t"
            // store
            "st2 {v0.4s, v1.4s}, [%[fout0]] \n\t"
            "st2 {v2.4s, v3.4s}, [%[fout1]] \n\t"
            "st2 {v4.4s, v5.4s}, [%[fout2]] \n\t"
            "st2 {v6.4s, v7.4s}, [%[fout3]] \n\t"
            :
            : [fout0]"r"(fout_r),
              [fout1]"r"(fout_r + (nfft>>1)),
              [fout2]"r"(fout_b + nfft),
              [fout3]"r"(fout_b + (nfft>>1)),
              [q2_out2r]"w"(q2_out2.val[0]),
              [q2_out2i]"w"(q2_out2.val[1]),
              [q2_out3r]"w"(q2_out3.val[0]),
              [q2_out3i]"w"(q2_out3.val[1]),
              [q2_in0r]"w"(q2_in0.val[0]),
              [q2_in0i]"w"(q2_in0.val[1]),
              [q2_in1r]"w"(q2_in1.val[0]),
              [q2_in1i]"w"(q2_in1.val[1]),
              [q2_in2r]"w"(q2_in2.val[0]),
              [q2_in2i]"w"(q2_in2.val[1]),
              [q2_in3r]"w"(q2_in3.val[0]),
              [q2_in3i]"w"(q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        fout_r += 8;
        fout_b -= 8;
    }
}
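
/*
 * Note (editor's addition): the vnegq_f32 negations and the
 * NE10_REVERSE_FLOAT32X4 reversals above exploit the conjugate symmetry of a
 * real signal's spectrum, X[nfft - k] = conj(X[k]): the upper half of the
 * output is the conjugated mirror of the lower half, written backwards
 * through fout_b while fout_r advances.
 */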

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_float32_t *fout_r = ((ne10_float32_t*) dst) + 12 + 16;
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 8;
    const ne10_float32_t *fin_b = (const ne10_float32_t*) src - 14;
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;
    ne10_int32_t loop_count = ((nfft>>2)-8)>>3;

    for ( ; loop_count>0; loop_count -- )
    {
        NE10_DECLARE_4(float32x4x2_t,q2_in);  // 8Q
        NE10_DECLARE_3(float32x4x2_t,q2_tw);  // 6Q
        NE10_DECLARE_4(float32x4x2_t,q2_out); // 8Q

        /*  INPUT
         *  0R 1R 2R 3R     Q0
         *  0I 1I 2I 3I     Q1
         *  4R 5R 6R 7R     Q2
         *  4I 5I 6I 7I     Q3
         *  8R 9R aR bR     Q4
         *  8I 9I aI bI     Q5
         *  cR dR eR fR     Q6
         *  cI dI eI fI     Q7
         */

        q2_in0 = vld2q_f32(fin_r            );
        q2_in1 = vld2q_f32(fin_r + (nfft>>1));
        fin_r += 8;

        q2_in3 = vld2q_f32(fin_b + (nfft>>1));
        q2_in2 = vld2q_f32(fin_b +  nfft    );
        fin_b -= 8;

        q2_tw0 = vld2q_f32(tw);
        tw += 8;
        q2_tw1 = vld2q_f32(tw);
        tw += 8;
        q2_tw2 = vld2q_f32(tw);
        tw += 8;

        // reverse -- CONJ
        NE10_REVERSE_FLOAT32X4( q2_in3.val[0] );
        NE10_REVERSE_FLOAT32X4( q2_in3.val[1] );
        NE10_REVERSE_FLOAT32X4( q2_in2.val[0] );
        NE10_REVERSE_FLOAT32X4( q2_in2.val[1] );

        q2_in2.val[1] = vnegq_f32( q2_in2.val[1] );
        q2_in3.val[1] = vnegq_f32( q2_in3.val[1] );

        // in -> out
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);

        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);

        q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);

        // out -> in
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
        q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);

        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
        q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);

        q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
        q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);

        q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
        q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);

        // tw
        // q2_in -> q2_out
        q2_out0 = q2_in0;
        NE10_CPX_MUL_INV_NEON_F32(q2_out1,q2_in1,q2_tw0);
        NE10_CPX_MUL_INV_NEON_F32(q2_out2,q2_in2,q2_tw1);
        NE10_CPX_MUL_INV_NEON_F32(q2_out3,q2_in3,q2_tw2);

        // transpose
        // q2_out -> q2_in
        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);

        // store
        vst1q_f32(fout_r, q2_in0.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in0.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in1.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in1.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in2.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in2.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in3.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in3.val[1]);
        fout_r += 4;
    }
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (dst, src, twiddles, nfft);

    if (nfft==16)
    {
        return;
    }

    ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (dst, src, twiddles, nfft);

    if (nfft==32)
    {
        return;
    }

    ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (dst, src, twiddles, nfft);
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (dst, src, twiddles, nfft);

    if (nfft==16)
    {
        return;
    }

    ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (dst, src, twiddles, nfft);

    if (nfft==32)
    {
        return;
    }

    ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (dst, src, twiddles, nfft);
}

/**
 * Specific implementation of ne10_fft_r2c_1d_float32 using NEON SIMD capabilities.
 */
void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
    ne10_float32_t *fout_r = (ne10_float32_t*) fout;

    switch (cfg->nfft)
    {
    case 2:
        ne10_radix2_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin);
        fout[0].r = fout[0].i;
        break;
    case 4:
        ne10_radix4_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 4);
        fout[0].r = fout[0].i;
        break;
    case 8:
        ne10_radix8_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
        fout[0].r = fout[0].i;
        break;
    default:
        ne10_mixed_radix_r2c_butterfly_float32_neon (fout, (CPLX*) fin, cfg->r_factors_neon, cfg->r_twiddles_neon, tmpbuf);
        ne10_radix4_r2c_with_twiddles_last_stage(fout, tmpbuf, cfg->r_super_twiddles_neon, cfg->nfft);
        fout[cfg->nfft / 2].r = fout[0].i;
        break;
    }
    fout[0].i = fout[cfg->nfft / 2].i = 0.0f;
}
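
/*
 * Packing note (editor's addition): for real input both bin 0 (DC) and bin
 * nfft/2 (Nyquist) are purely real. The kernels carry the Nyquist value in
 * fout[0].i until the fix-up above moves it into fout[nfft/2].r and zeroes
 * both imaginary parts, so the caller sees nfft/2 + 1 proper complex bins.
 */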

/**
 * Specific implementation of ne10_fft_c2r_1d_float32 using NEON SIMD capabilities.
 */
void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
    ne10_fft_cpx_float32_t * fout_c;
    ne10_int32_t stage_count;
    ne10_int32_t radix;

    switch (cfg->nfft)
    {
    case 2:
        fin[0].i = fin[0].r;
        fin[0].r = 0.0f;
        ne10_radix2_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i);
        fin[0].r = fin[0].i;
        break;
    case 4:
        fin[0].i = fin[0].r;
        fin[0].r = 0.0f;
        ne10_radix4_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 4);
        fin[0].r = fin[0].i;
        break;
    case 8:
        fin[0].i = fin[0].r;
        fin[0].r = 0.0f;
        ne10_radix8_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
        fin[0].r = fin[0].i;
        break;
    default:
        stage_count = cfg->r_factors_neon[0];
        radix       = cfg->r_factors_neon[ stage_count << 1 ];
        if (radix==2)
        {
            stage_count --;
        }
        fin[0].i = fin[cfg->nfft>>1].r;
        fout_c = (stage_count % 2==1) ? tmpbuf : (CPLX*)fout;
        ne10_radix4_c2r_with_twiddles_first_stage( (CPLX*) fout_c, fin, cfg->r_super_twiddles_neon, cfg->nfft);
        ne10_mixed_radix_c2r_butterfly_float32_neon ( (CPLX*) fout, (CPLX*) NULL, cfg->r_factors_neon, cfg->r_twiddles_neon_backward, tmpbuf);
        break;
    }
    fin[0].i = 0.0f;
}
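
For reference, a minimal usage sketch of these entry points through the public Ne10 r2c API. This is illustrative and not part of the file; it assumes the standard ne10_fft_alloc_r2c_float32 / ne10_fft_destroy_r2c_float32 helpers and a power-of-two transform length:

#include <stdio.h>
#include <stdlib.h>
#include "NE10_dsp.h"

int main (void)
{
    const ne10_int32_t nfft = 64; // power-of-two length
    ne10_float32_t *in = (ne10_float32_t*) malloc (nfft * sizeof (ne10_float32_t));
    // The r2c output needs nfft/2 + 1 complex bins (DC .. Nyquist).
    ne10_fft_cpx_float32_t *out =
        (ne10_fft_cpx_float32_t*) malloc ((nfft / 2 + 1) * sizeof (ne10_fft_cpx_float32_t));

    for (ne10_int32_t i = 0; i < nfft; i++) // some test signal
        in[i] = (ne10_float32_t) i;

    ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (nfft);

    ne10_fft_r2c_1d_float32_neon (out, in, cfg); // forward: nfft reals -> complex bins
    ne10_fft_c2r_1d_float32_neon (in, out, cfg); // inverse: bins -> nfft reals

    printf ("bin 0: %f + %fi\n", out[0].r, out[0].i);

    ne10_fft_destroy_r2c_float32 (cfg);
    free (in);
    free (out);
    return 0;
}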