Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_float32.neon.c
/*
 *  Copyright 2013-16 ARM Limited and Contributors.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of ARM Limited nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_float32.neon.c
 */

#include <assert.h>   // for the assert() in ne10_fft_c2c_1d_float32_neon below
#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"

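/*
 * 4-point forward FFT as a single radix-4 butterfly. With inputs x0..x3:
 *   X[0] = (x0 + x2) + (x1 + x3)
 *   X[1] = (x0 - x2) - j(x1 - x3)
 *   X[2] = (x0 + x2) - (x1 + x3)
 *   X[3] = (x0 - x2) + j(x1 - x3)
 * Multiplying by -j maps (r, i) to (i, -r), which is why the real and
 * imaginary parts are cross-combined for Fout[1] and Fout[3] below.
 */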
static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

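/*
 * 4-point inverse FFT: the same butterfly as above with the sign of j
 * flipped (compare Fout[1] and Fout[3] with the forward version), scaled
 * by 1/nfft = 0.25f so that a forward/backward round trip is the identity.
 */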
static inline void ne10_fft4_backward_float32 (ne10_fft_cpx_float32_t * Fout,
                                               ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = (tmp_r - s0_r) * 0.25f;
    Fout[2].i = (tmp_i - s0_i) * 0.25f;
    Fout[0].r = (tmp_r + s0_r) * 0.25f;
    Fout[0].i = (tmp_i + s0_i) * 0.25f;

    Fout[1].r = (s2_r - s1_i) * 0.25f;
    Fout[1].i = (s2_i + s1_r) * 0.25f;
    Fout[3].r = (s2_r + s1_i) * 0.25f;
    Fout[3].i = (s2_i - s1_r) * 0.25f;
}

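/*
 * 8-point forward FFT: a radix-2 split into sums (s0, s2, s4, s6) and
 * differences (s1, s3, s5, s7), each finished with a 4-point butterfly.
 * TW_81 is cos(pi/4) = 1/sqrt(2), the only non-trivial twiddle value an
 * 8-point transform needs; t4 and t5 apply it to the odd-index differences.
 */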
static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (s3_r + s3_i) * TW_81;
    t4_i = - (s3_r - s3_i) * TW_81;
    t5_r = (s7_r - s7_i) * TW_81;
    t5_i = (s7_r + s7_i) * TW_81;

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

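/*
 * 8-point inverse FFT: mirrors the forward version with conjugated
 * twiddle factors and a final scaling by 1/nfft = 0.125f.
 */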
static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
                                               ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = (t1_r + t2_r) * 0.125f;
    Fout[0].i = (t1_i + t2_i) * 0.125f;
    Fout[4].r = (t1_r - t2_r) * 0.125f;
    Fout[4].i = (t1_i - t2_i) * 0.125f;
    Fout[2].r = (t0_r - t3_i) * 0.125f;
    Fout[2].i = (t0_i + t3_r) * 0.125f;
    Fout[6].r = (t0_r + t3_i) * 0.125f;
    Fout[6].i = (t0_i - t3_r) * 0.125f;

    t4_r = (s3_r - s3_i) * TW_81;
    t4_i = (s3_r + s3_i) * TW_81;
    t5_r = (s7_r + s7_i) * TW_81;
    t5_i = - (s7_r - s7_i) * TW_81;

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = (t1_r + t2_r) * 0.125f;
    Fout[1].i = (t1_i + t2_i) * 0.125f;
    Fout[5].r = (t1_r - t2_r) * 0.125f;
    Fout[5].i = (t1_i - t2_i) * 0.125f;
    Fout[3].r = (t0_r - t3_i) * 0.125f;
    Fout[3].i = (t0_i + t3_r) * 0.125f;
    Fout[7].r = (t0_r + t3_i) * 0.125f;
    Fout[7].i = (t0_i - t3_r) * 0.125f;
}

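/*
 * 16-point forward FFT, vectorized with NEON as two radix-4 stages.
 * vld2q_f32 de-interleaves four complex values into separate real and
 * imaginary float32x4_t registers, so each vector instruction below does
 * the work of four scalar butterflies.
 */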
static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                             ne10_fft_cpx_float32_t * Fin,
                                             ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // the first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);

    // the second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}

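/*
 * 16-point inverse FFT: the same two-stage structure as the forward
 * transform, but with conjugated twiddles (note the swapped vmlaq_f32/
 * vmlsq_f32 pattern) and a final scaling by 1/nfft = 0.0625f.
 */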
static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin,
                                              ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // the first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);

    // the second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    float32x4_t q_one_by_nfft;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
    q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q_one_by_nfft = vdupq_n_f32 (0.0625f);
    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);

    q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
    q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
    q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
    q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
    q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
    q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
    q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
    q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}

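/*
 * Recombination step of the real-to-complex FFT. An nfft-point real FFT is
 * computed as an ncfft = nfft/2 point complex FFT over pairs of samples,
 * and this split recovers the true half-spectrum:
 *   dst[k]         = (f1k + tw) / 2
 *   dst[ncfft - k] = conj(f1k - tw) / 2
 * where f1k/f2k are the even/odd parts and tw is f2k rotated by the "super
 * twiddles". The vector path handles four bins per iteration, using
 * vrev64q_f32/vcombine_f32 to step through dst[ncfft - k] in reverse order.
 */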
static void ne10_fft_split_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
                                                const ne10_fft_cpx_float32_t *src,
                                                ne10_fft_cpx_float32_t *twiddles,
                                                ne10_int32_t ncfft)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fpnk, fpk, f1k, f2k, tw, tdc;
    float32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fpnk_r, q_fpnk_i;
    float32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    float32x4_t q_tw_r, q_tw_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst_r, q_dst_i, q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;

    if (count >= 4)
    {
        for (k = 1; k <= count; k += 4)
        {
            p_src = (float32_t*) (& (src[k]));
            p_src2 = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (float32_t*) (& (twiddles[k - 1]));
            p_dst = (float32_t*) (& (dst[k]));
            p_dst2 = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk = vld2q_f32 (p_src);
            q2_fpnk = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            q2_fpnk.val[0] = vrev64q_f32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_f32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_f32 (vget_high_f32 (q2_fpnk.val[0]), vget_low_f32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_f32 (vget_high_f32 (q2_fpnk.val[1]), vget_low_f32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_f32 (q_fpnk_i);

            q_f1k_r = vaddq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vaddq_f32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vsubq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vsubq_f32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vmulq_f32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vmulq_f32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vmulq_f32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_f32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_f32 (q_tmp2, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_f1k_r, q_tw_r);
            q_dst2_i = vsubq_f32 (q_tw_i, q_f1k_i);
            q_dst_r = vaddq_f32 (q_f1k_r, q_tw_r);
            q_dst_i = vaddq_f32 (q_f1k_i, q_tw_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q_dst_r, q_val);
            q2_dst.val[1] = vmulq_f32 (q_dst_i, q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count; k++)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = f2k.r * (twiddles[k - 1]).r - f2k.i * (twiddles[k - 1]).i;
            tw.i = f2k.r * (twiddles[k - 1]).i + f2k.i * (twiddles[k - 1]).r;

            dst[k].r = (f1k.r + tw.r) * 0.5f;
            dst[k].i = (f1k.i + tw.i) * 0.5f;
            dst[ncfft - k].r = (f1k.r - tw.r) * 0.5f;
            dst[ncfft - k].i = (tw.i - f1k.i) * 0.5f;
        }
    }
}

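/*
 * Inverse of the split above: rebuilds the ncfft-point complex spectrum
 * from the packed real spectrum before the complex inverse FFT runs. fek
 * and fok are the even and odd parts; the odd part is de-rotated by
 * multiplying with the conjugate of the super twiddles.
 */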
static void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
                                                const ne10_fft_cpx_float32_t *src,
                                                ne10_fft_cpx_float32_t *twiddles,
                                                ne10_int32_t ncfft)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fk, fnkc, fek, fok, tmp;
    float32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fnkc_r, q_fnkc_i;
    float32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = (src[0].r + src[ncfft].r) * 0.5f;
    dst[0].i = (src[0].r - src[ncfft].r) * 0.5f;

    if (count >= 4)
    {
        for (k = 1; k <= count; k += 4)
        {
            p_src = (float32_t*) (& (src[k]));
            p_src2 = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (float32_t*) (& (twiddles[k - 1]));
            p_dst = (float32_t*) (& (dst[k]));
            p_dst2 = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fk = vld2q_f32 (p_src);
            q2_fnkc = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_f32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_f32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_f32 (vget_high_f32 (q2_fnkc.val[0]), vget_low_f32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_f32 (vget_high_f32 (q2_fnkc.val[1]), vget_low_f32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_f32 (q_fnkc_i);

            q_fek_r = vaddq_f32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vaddq_f32 (q2_fk.val[1], q_fnkc_i);

            q_tmp0 = vsubq_f32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vsubq_f32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vmulq_f32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vmulq_f32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vmulq_f32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_f32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_f32 (q_fok_i, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_f32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_f32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_f32 (q_fek_i, q_fok_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q2_dst.val[0], q_val);
            q2_dst.val[1] = vmulq_f32 (q2_dst.val[1], q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = tmp.r * twiddles[k - 1].r + tmp.i * twiddles[k - 1].i;
            fok.i = tmp.i * twiddles[k - 1].r - tmp.r * twiddles[k - 1].i;

            dst[k].r = (fek.r + fok.r) * 0.5f;
            dst[k].i = (fek.i + fok.i) * 0.5f;

            dst[ncfft - k].r = (fek.r - fok.r) * 0.5f;
            dst[ncfft - k].i = (fok.i - fek.i) * 0.5f;
        }
    }
}

/**
 * Specific implementation of ne10_fft_c2c_1d_float32 using NEON SIMD capabilities.
 */
void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_cfg_float32_t cfg,
                                   ne10_int32_t inverse_fft)
{
    // For inputs shorter than 15 samples, fall back to the C version, as we are
    // unlikely to see an improvement from NEON in these cases.
    if (cfg->nfft < 15)
    {
        ne10_fft_c2c_1d_float32_c (fout, fin, cfg, inverse_fft);
        return;
    }

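    // factors[0] holds the number of radix stages; by convention, the entry at
    // index 2 * (stage_count + 1) records which algorithm the planner selected.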
    ne10_int32_t stage_count = cfg->factors[0];
    ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];

    assert ((algorithm_flag == NE10_FFT_ALG_DEFAULT)
            || (algorithm_flag == NE10_FFT_ALG_ANY));

    // For NE10_FFT_ALG_ANY, the function returns inside this branch.
    if (algorithm_flag == NE10_FFT_ALG_ANY)
    {
        if (inverse_fft)
        {
            ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_backward_scaled);
        }
        else
        {
            ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_forward_scaled);
        }
        return;
    }

    // Having passed the assertion and not returned above, algorithm_flag must
    // be NE10_FFT_ALG_DEFAULT.
    if (inverse_fft)
    {
        switch (cfg->nfft)
        {
        case 4:
            ne10_fft4_backward_float32 (fout, fin);
            break;
        case 8:
            ne10_fft8_backward_float32 (fout, fin);
            break;
        case 16:
            ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);
            break;
        default:
            ne10_mixed_radix_fft_backward_float32_neon (fout, fin, cfg->factors,
                    cfg->twiddles, cfg->buffer);
            break;
        }
    }
    else
    {
        switch (cfg->nfft)
        {
        case 4:
            ne10_fft4_forward_float32 (fout, fin);
            break;
        case 8:
            ne10_fft8_forward_float32 (fout, fin);
            break;
        case 16:
            ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
            break;
        default:
            ne10_mixed_radix_fft_forward_float32_neon (fout, fin, cfg->factors,
                    cfg->twiddles, cfg->buffer);
            break;
        }
    }
}
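
/*
 * Illustrative usage sketch (not part of this file). The plan helpers named
 * here are assumed from the public Ne10 FFT API declared in NE10_dsp.h:
 *
 *   ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32 (1024);
 *   ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0); // forward transform
 *   ne10_fft_c2c_1d_float32_neon (src, dst, cfg, 1); // inverse transform
 *   ne10_fft_destroy_c2c_float32 (cfg);
 */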

/**
 * Specific implementation of ne10_fft_r2c_1d_float32 using NEON SIMD capabilities.
 */
void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_float32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_float32_neon (tmpbuf1, (ne10_fft_cpx_float32_t*) fin, &c2c_state, 0);
    ne10_fft_split_r2c_1d_float32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft);
}

/**
 * Specific implementation of ne10_fft_c2r_1d_float32 using NEON SIMD capabilities.
 */
void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_float32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_float32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft);
    ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) fout, tmpbuf1, &c2c_state, 1);
}