Project Ne10
An open, optimized software library for the ARM architecture.
NE10_rfft_float32.c
Go to the documentation of this file.
1 /*
2  * Copyright 2014-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_rfft_float32.c
45  */
46 
47 #include "NE10_types.h"
48 #include "NE10_macros.h"
49 #include "NE10_fft.h"
50 #include "NE10_dsp.h"
51 
52 #if (NE10_UNROLL_LEVEL > 0)
53 
54 void ne10_radix8_r2c_c (ne10_fft_cpx_float32_t *Fout,
55  const ne10_fft_cpx_float32_t *Fin,
56  const ne10_int32_t fstride,
57  const ne10_int32_t mstride,
58  const ne10_int32_t nfft)
59 {
60  const ne10_int32_t in_step = nfft >> 3;
61  ne10_int32_t f_count;
62 
63  ne10_float32_t scratch_in[8];
64  ne10_float32_t scratch [4];
65 
66  /* real pointers */
67  const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
68  ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
69  Fout_r ++; // always leave the first real empty
70 
71  for (f_count = fstride; f_count; f_count --)
72  {
73  scratch_in[0] = Fin_r[in_step * 0] + Fin_r[in_step * (0 + 4)];
74  scratch_in[1] = Fin_r[in_step * 0] - Fin_r[in_step * (0 + 4)];
75  scratch_in[2] = Fin_r[in_step * 1] + Fin_r[in_step * (1 + 4)];
76  scratch_in[3] = Fin_r[in_step * 1] - Fin_r[in_step * (1 + 4)];
77  scratch_in[4] = Fin_r[in_step * 2] + Fin_r[in_step * (2 + 4)];
78  scratch_in[5] = Fin_r[in_step * 2] - Fin_r[in_step * (2 + 4)];
79  scratch_in[6] = Fin_r[in_step * 3] + Fin_r[in_step * (3 + 4)];
80  scratch_in[7] = Fin_r[in_step * 3] - Fin_r[in_step * (3 + 4)];
81 
82  scratch_in[3] *= TW_81_F32;
83  scratch_in[7] *= TW_81N_F32;
84 
85  // radix 2 butterfly
86  scratch[0] = scratch_in[0] + scratch_in[4];
87  scratch[1] = scratch_in[2] + scratch_in[6];
88  scratch[2] = scratch_in[7] - scratch_in[3];
89  scratch[3] = scratch_in[3] + scratch_in[7];
90 
91  Fout_r[0] = scratch [0] + scratch [1];
92  Fout_r[7] = scratch [0] - scratch [1];
93 
94  Fout_r[1] = scratch_in[1] + scratch [3];
95  Fout_r[5] = scratch_in[1] - scratch [3];
96 
97  Fout_r[2] = scratch [2] - scratch_in[5];
98  Fout_r[6] = scratch [2] + scratch_in[5];
99 
100  Fout_r[3] = scratch_in[0] - scratch_in[4];
101 
102  Fout_r[4] = scratch_in[6] - scratch_in[2];
103 
104  Fin_r ++;
105  Fout_r += 8;
106  }
107 }
108 
109 void ne10_radix8_c2r_c (ne10_fft_cpx_float32_t *Fout,
110  const ne10_fft_cpx_float32_t *Fin,
111  const ne10_int32_t fstride,
112  const ne10_int32_t mstride,
113  const ne10_int32_t nfft)
114 {
115  const ne10_int32_t in_step = nfft >> 3;
116  ne10_int32_t f_count;
117 
118  ne10_float32_t scratch_in[8];
119 
120  const ne10_float32_t one_by_N = 1.0 / nfft;
121 
122  /* real pointers */
123  const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
124  ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
125 
126  for (f_count = fstride; f_count; f_count --)
127  {
128  scratch_in[0] = Fin_r[0] + Fin_r[3] + Fin_r[3] + Fin_r[7];
129  scratch_in[1] = Fin_r[1] + Fin_r[1] + Fin_r[5] + Fin_r[5];
130  scratch_in[2] = Fin_r[0] - Fin_r[4] - Fin_r[4] - Fin_r[7];
131  scratch_in[3] = Fin_r[1] - Fin_r[2] - Fin_r[5] - Fin_r[6];
132  scratch_in[4] = Fin_r[0] - Fin_r[3] - Fin_r[3] + Fin_r[7];
133  scratch_in[5] = - Fin_r[2] - Fin_r[2] + Fin_r[6] + Fin_r[6];
134  scratch_in[6] = Fin_r[0] + Fin_r[4] + Fin_r[4] - Fin_r[7];
135  scratch_in[7] = Fin_r[1] + Fin_r[2] - Fin_r[5] + Fin_r[6];
136 
137  scratch_in[3] /= TW_81_F32;
138  scratch_in[7] /= TW_81N_F32;
139 
140  Fout_r[0 * in_step] = scratch_in[0] + scratch_in[1];
141  Fout_r[4 * in_step] = scratch_in[0] - scratch_in[1];
142  Fout_r[1 * in_step] = scratch_in[2] + scratch_in[3];
143  Fout_r[5 * in_step] = scratch_in[2] - scratch_in[3];
144  Fout_r[2 * in_step] = scratch_in[4] + scratch_in[5];
145  Fout_r[6 * in_step] = scratch_in[4] - scratch_in[5];
146  Fout_r[3 * in_step] = scratch_in[6] + scratch_in[7];
147  Fout_r[7 * in_step] = scratch_in[6] - scratch_in[7];
148 
149 #if defined(NE10_DSP_RFFT_SCALING)
150  Fout_r[0 * in_step] *= one_by_N;
151  Fout_r[4 * in_step] *= one_by_N;
152  Fout_r[1 * in_step] *= one_by_N;
153  Fout_r[5 * in_step] *= one_by_N;
154  Fout_r[2 * in_step] *= one_by_N;
155  Fout_r[6 * in_step] *= one_by_N;
156  Fout_r[3 * in_step] *= one_by_N;
157  Fout_r[7 * in_step] *= one_by_N;
158 #endif
159 
160  Fin_r += 8;
161  Fout_r ++;
162  }
163 }
164 
165 void ne10_radix4_r2c_c (ne10_fft_cpx_float32_t *Fout,
166  const ne10_fft_cpx_float32_t *Fin,
167  const ne10_int32_t fstride,
168  const ne10_int32_t mstride,
169  const ne10_int32_t nfft)
170 {
171  const ne10_int32_t in_step = nfft >> 2;
172  ne10_int32_t f_count;
173 
174  ne10_float32_t scratch_in [4];
175  ne10_float32_t scratch_out[4];
176 
177  /* real pointers */
178  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
179  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
180  Fout_r ++; // always leave the first real empty
181 
182  for (f_count = fstride; f_count; f_count --)
183  {
184  scratch_in[0] = Fin_r[0 * in_step];
185  scratch_in[1] = Fin_r[1 * in_step];
186  scratch_in[2] = Fin_r[2 * in_step];
187  scratch_in[3] = Fin_r[3 * in_step];
188 
189  // NE10_PRINT_Q_VECTOR(scratch_in);
190 
191  NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
192 
193  // NE10_PRINT_Q_VECTOR(scratch_out);
194 
195  Fout_r[0] = scratch_out[0];
196  Fout_r[1] = scratch_out[1];
197  Fout_r[2] = scratch_out[2];
198  Fout_r[3] = scratch_out[3];
199 
200  Fin_r ++;
201  Fout_r += 4;
202  }
203 }
204 
205 void ne10_radix4_c2r_c (ne10_fft_cpx_float32_t *Fout,
206  const ne10_fft_cpx_float32_t *Fin,
207  const ne10_int32_t fstride,
208  const ne10_int32_t mstride,
209  const ne10_int32_t nfft)
210 {
211  ne10_int32_t f_count;
212  const ne10_int32_t in_step = nfft >> 2;
213  ne10_float32_t scratch_in [4];
214  ne10_float32_t scratch_out[4];
215 
216  const ne10_float32_t one_by_N = 1.0 / nfft;
217 
218  /* real pointers */
219  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
220  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
221 
222  for (f_count = fstride; f_count; f_count --)
223  {
224  scratch_in[0] = Fin_r[0];
225  scratch_in[1] = Fin_r[1];
226  scratch_in[2] = Fin_r[2];
227  scratch_in[3] = Fin_r[3];
228 
229  // NE10_PRINT_Q_VECTOR(scratch_in);
230 
231  NE10_FFT_C2R_RCR_4R(scratch_out,scratch_in);
232 
233  // NE10_PRINT_Q_VECTOR(scratch_out);
234 
235 #if defined(NE10_DSP_RFFT_SCALING)
236  scratch_out[0] *= one_by_N;
237  scratch_out[1] *= one_by_N;
238  scratch_out[2] *= one_by_N;
239  scratch_out[3] *= one_by_N;
240 #endif
241 
242  // store
243  Fout_r[0 * in_step] = scratch_out[0];
244  Fout_r[1 * in_step] = scratch_out[1];
245  Fout_r[2 * in_step] = scratch_out[2];
246  Fout_r[3 * in_step] = scratch_out[3];
247 
248  Fin_r += 4;
249  Fout_r ++;
250  }
251 }
252 
253 void ne10_radix2_r2c_c (ne10_fft_cpx_float32_t *Fout,
254  const ne10_fft_cpx_float32_t *Fin)
255 {
256  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
257  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
258  Fout_r ++; // always leave the first real empty
259 
260  Fout_r[0] = Fin_r[0] + Fin_r[1];
261  Fout_r[1] = Fin_r[0] - Fin_r[1];
262 }
263 
264 void ne10_radix2_c2r_c (ne10_fft_cpx_float32_t *Fout,
265  const ne10_fft_cpx_float32_t *Fin)
266 {
267  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
268  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
269 
270  Fout_r[0] = Fin_r[0] + Fin_r[1];
271  Fout_r[1] = Fin_r[0] - Fin_r[1];
272 #if defined(NE10_DSP_RFFT_SCALING)
273  Fout_r[0] *= 0.5;
274  Fout_r[1] *= 0.5;
275 #endif
276 }
277 
278 NE10_INLINE void ne10_radix4_r2c_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
279  const ne10_float32_t *Fin_r,
280  const ne10_int32_t out_step,
281  const ne10_int32_t in_step,
282  const ne10_fft_cpx_float32_t *twiddles)
283 {
284  ne10_float32_t scratch_out[4];
285  ne10_float32_t scratch_in [4];
286 
287  // load
288  scratch_in[0] = Fin_r[0 * in_step];
289  scratch_in[1] = Fin_r[1 * in_step];
290  scratch_in[2] = Fin_r[2 * in_step];
291  scratch_in[3] = Fin_r[3 * in_step];
292 
293  // NE10_PRINT_Q_VECTOR(scratch_in);
294 
295  NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
296 
297  // NE10_PRINT_Q_VECTOR(scratch_out);
298 
299  // store
300  Fout_r[ 0] = scratch_out[0];
301  Fout_r[ (out_step << 1) - 1] = scratch_out[1];
302  Fout_r[ (out_step << 1) ] = scratch_out[2];
303  Fout_r[2 * (out_step << 1) - 1] = scratch_out[3];
304 }
305 
306 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
307  const ne10_float32_t *Fin_r,
308  const ne10_int32_t out_step,
309  const ne10_int32_t in_step,
310  const ne10_fft_cpx_float32_t *twiddles)
311 {
312  ne10_float32_t scratch [8];
313  ne10_float32_t scratch_in_r [4];
314  ne10_float32_t scratch_out_r[4];
315 
316  // load
317  scratch_in_r[0] = Fin_r[0 ];
318  scratch_in_r[1] = Fin_r[1*(out_step<<1)-1];
319  scratch_in_r[2] = Fin_r[1*(out_step<<1) ];
320  scratch_in_r[3] = Fin_r[2*(out_step<<1)-1];
321 
322  // NE10_PRINT_Q_VECTOR(scratch_in_r);
323 
324  // radix 4 butterfly without twiddles
325  scratch[0] = scratch_in_r[0] + scratch_in_r[3];
326  scratch[1] = scratch_in_r[0] - scratch_in_r[3];
327  scratch[2] = scratch_in_r[1] + scratch_in_r[1];
328  scratch[3] = scratch_in_r[2] + scratch_in_r[2];
329 
330  scratch_out_r[0] = scratch[0] + scratch[2];
331  scratch_out_r[1] = scratch[1] - scratch[3];
332  scratch_out_r[2] = scratch[0] - scratch[2];
333  scratch_out_r[3] = scratch[1] + scratch[3];
334 
335  // NE10_PRINT_Q_VECTOR(scratch_out_r);
336 
337  // store
338  Fout_r[0 * in_step] = scratch_out_r[0];
339  Fout_r[1 * in_step] = scratch_out_r[1];
340  Fout_r[2 * in_step] = scratch_out_r[2];
341  Fout_r[3 * in_step] = scratch_out_r[3];
342 
343 }
344 
345 NE10_INLINE void ne10_radix4_r2c_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
346  const ne10_float32_t *Fin_r,
347  const ne10_int32_t out_step,
348  const ne10_int32_t in_step,
349  const ne10_fft_cpx_float32_t *twiddles)
350 {
351  ne10_int32_t m_count;
352  ne10_float32_t *Fout_b = Fout_r + (((out_step<<1)-1)<<1) - 2; // reversed
353  ne10_fft_cpx_float32_t scratch_tw[3], scratch_in[4];
354 
355  ne10_fft_cpx_float32_t scratch[4], scratch_out[4];
356 
357  for (m_count = (out_step >> 1) - 1; m_count; m_count --)
358  {
359  scratch_tw [0] = twiddles[0 * out_step];
360  scratch_tw [1] = twiddles[1 * out_step];
361  scratch_tw [2] = twiddles[2 * out_step];
362 
363  scratch_in[0].r = Fin_r[0 * in_step ];
364  scratch_in[0].i = Fin_r[0 * in_step + 1];
365  scratch_in[1].r = Fin_r[1 * in_step ];
366  scratch_in[1].i = Fin_r[1 * in_step + 1];
367  scratch_in[2].r = Fin_r[2 * in_step ];
368  scratch_in[2].i = Fin_r[2 * in_step + 1];
369  scratch_in[3].r = Fin_r[3 * in_step ];
370  scratch_in[3].i = Fin_r[3 * in_step + 1];
371 
372  // NE10_PRINT_Q2_VECTOR(scratch_in);
373 
374  // radix 4 butterfly with twiddles
375  scratch[0].r = scratch_in[0].r;
376  scratch[0].i = scratch_in[0].i;
377  scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
378  scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;
379 
380  scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
381  scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;
382 
383  scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
384  scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;
385 
386  NE10_FFT_R2C_CC_CC(scratch_out,scratch);
387 
388  // NE10_PRINT_Q2_VECTOR(scratch_in);
389 
390  // result
391  Fout_r[ 0] = scratch_out[0].r;
392  Fout_r[ 1] = scratch_out[0].i;
393  Fout_r[ (out_step << 1) ] = scratch_out[1].r;
394  Fout_r[ (out_step << 1) + 1] = scratch_out[1].i;
395  Fout_b[ 0] = scratch_out[2].r;
396  Fout_b[ 1] = scratch_out[2].i;
397  Fout_b[- (out_step << 1) ] = scratch_out[3].r;
398  Fout_b[- (out_step << 1) + 1] = scratch_out[3].i;
399 
400  // update pointers
401  Fin_r += 2;
402  Fout_r += 2;
403  Fout_b -= 2;
404  twiddles ++;
405  }
406 }
407 
408 NE10_INLINE void ne10_radix4_c2r_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
409  const ne10_float32_t *Fin_r,
410  const ne10_int32_t out_step,
411  const ne10_int32_t in_step,
412  const ne10_fft_cpx_float32_t *twiddles)
413 {
414  ne10_int32_t m_count;
415  const ne10_float32_t *Fin_b = Fin_r + (((out_step<<1)-1)<<1) - 2; // reversed
416  ne10_fft_cpx_float32_t scratch_tw [3],
417  scratch [8],
418  scratch_in [4],
419  scratch_out[4];
420 
421  for (m_count = (out_step >> 1) - 1; m_count; m_count --)
422  {
423  scratch_tw[0] = twiddles[0 * out_step];
424  scratch_tw[1] = twiddles[1 * out_step];
425  scratch_tw[2] = twiddles[2 * out_step];
426 
427  scratch_in[0].r = Fin_r[0];
428  scratch_in[0].i = Fin_r[1];
429 
430  scratch_in[1].r = Fin_b[0];
431  scratch_in[1].i = Fin_b[1];
432 
433  scratch_in[2].r = Fin_r[(out_step<<1) + 0];
434  scratch_in[2].i = Fin_r[(out_step<<1) + 1];
435 
436  scratch_in[3].r = Fin_b[-(out_step<<1) + 0];
437  scratch_in[3].i = Fin_b[-(out_step<<1) + 1];
438 
439  // NE10_PRINT_Q2_VECTOR(scratch_in);
440 
441  // // inverse of "result"
442  NE10_FFT_C2R_CC_CC(scratch,scratch_in);
443 
444  // inverse of "mutltipy twiddles"
445  scratch_out[0] = scratch[0];
446 
447  scratch_out[1].r = scratch[1].r * scratch_tw[0].r + scratch[1].i * scratch_tw[0].i;
448  scratch_out[1].i = scratch[1].i * scratch_tw[0].r - scratch[1].r * scratch_tw[0].i;
449 
450  scratch_out[2].r = scratch[2].r * scratch_tw[1].r + scratch[2].i * scratch_tw[1].i;
451  scratch_out[2].i = scratch[2].i * scratch_tw[1].r - scratch[2].r * scratch_tw[1].i;
452 
453  scratch_out[3].r = scratch[3].r * scratch_tw[2].r + scratch[3].i * scratch_tw[2].i;
454  scratch_out[3].i = scratch[3].i * scratch_tw[2].r - scratch[3].r * scratch_tw[2].i;
455 
456  // NE10_PRINT_Q2_VECTOR(scratch_out);
457 
458  // store
459  Fout_r[0 * in_step ] = scratch_out[0].r;
460  Fout_r[0 * in_step + 1] = scratch_out[0].i;
461  Fout_r[1 * in_step ] = scratch_out[1].r;
462  Fout_r[1 * in_step + 1] = scratch_out[1].i;
463  Fout_r[2 * in_step ] = scratch_out[2].r;
464  Fout_r[2 * in_step + 1] = scratch_out[2].i;
465  Fout_r[3 * in_step ] = scratch_out[3].r;
466  Fout_r[3 * in_step + 1] = scratch_out[3].i;
467 
468  // update pointers
469  Fin_r += 2;
470  Fout_r += 2;
471  Fin_b -= 2;
472  twiddles ++;
473  }
474 }
475 
476 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
477  const ne10_float32_t *Fin_r,
478  const ne10_int32_t out_step,
479  const ne10_int32_t in_step,
480  const ne10_fft_cpx_float32_t *twiddles)
481 {
482  ne10_float32_t scratch_in [4];
483  ne10_float32_t scratch_out[4];
484 
485  scratch_in[0] = Fin_r[0 * in_step];
486  scratch_in[1] = Fin_r[1 * in_step];
487  scratch_in[2] = Fin_r[2 * in_step];
488  scratch_in[3] = Fin_r[3 * in_step];
489 
490  // NE10_PRINT_Q_VECTOR(scratch_in);
491 
492  NE10_FFT_R2C_4R_CC(scratch_out,scratch_in);
493 
494  // NE10_PRINT_Q_VECTOR(scratch_out);
495 
496  Fout_r[ 0] = scratch_out[0];
497  Fout_r[ 1] = scratch_out[1];
498  Fout_r[ (out_step << 1) ] = scratch_out[2];
499  Fout_r[ (out_step << 1) + 1] = scratch_out[3];
500 }
501 
502 NE10_INLINE void ne10_radix4_c2r_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
503  const ne10_float32_t *Fin_r,
504  const ne10_int32_t out_step,
505  const ne10_int32_t in_step,
506  const ne10_fft_cpx_float32_t *twiddles)
507 {
508  // inverse operation of ne10_radix4_r2c_with_twiddles_last_butterfly_c
509  ne10_float32_t scratch_in [4];
510  ne10_float32_t scratch_out[4];
511 
512  // load
513  scratch_in[0] = Fin_r[ 0];
514  scratch_in[1] = Fin_r[ 1];
515  scratch_in[2] = Fin_r[ (out_step << 1) ];
516  scratch_in[3] = Fin_r[ (out_step << 1) + 1];
517 
518  // NE10_PRINT_Q_VECTOR(scratch_in);
519 
520  NE10_FFT_C2R_CC_4R(scratch_out,scratch_in);
521 
522  // NE10_PRINT_Q_VECTOR(scratch_out);
523 
524  // store
525  Fout_r[0 * in_step] = scratch_out[0];
526  Fout_r[1 * in_step] = scratch_out[1];
527  Fout_r[2 * in_step] = scratch_out[2];
528  Fout_r[3 * in_step] = scratch_out[3];
529 }
530 
531 NE10_INLINE void ne10_radix4_r2c_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
532  const ne10_fft_cpx_float32_t *Fin,
533  const ne10_int32_t fstride,
534  const ne10_int32_t mstride,
535  const ne10_int32_t nfft,
536  const ne10_fft_cpx_float32_t *twiddles)
537 {
538  ne10_int32_t f_count;
539  const ne10_int32_t in_step = nfft >> 2;
540  const ne10_int32_t out_step = mstride;
541 
542  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
543  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
544  const ne10_fft_cpx_float32_t *tw;
545 
546  Fout_r ++;
547  Fin_r ++;
548 
549  for (f_count = fstride; f_count; f_count --)
550  {
551  tw = twiddles;
552 
553  // first butterfly
554  ne10_radix4_r2c_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
555 
556  tw ++;
557  Fin_r ++;
558  Fout_r ++;
559 
560  // other butterfly
561  ne10_radix4_r2c_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
562 
563  // update Fin_r, Fout_r, twiddles
564  tw += ( (out_step >> 1) - 1);
565  Fin_r += 2 * ( (out_step >> 1) - 1);
566  Fout_r += 2 * ( (out_step >> 1) - 1);
567 
568  // last butterfly
569  ne10_radix4_r2c_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
570  tw ++;
571  Fin_r ++;
572  Fout_r ++;
573 
574  Fout_r += 3 * out_step;
575  } // f_count
576 }
577 
578 NE10_INLINE void ne10_radix4_c2r_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
579  const ne10_fft_cpx_float32_t *Fin,
580  const ne10_int32_t fstride,
581  const ne10_int32_t mstride,
582  const ne10_int32_t nfft,
583  const ne10_fft_cpx_float32_t *twiddles)
584 {
585  ne10_int32_t f_count;
586  const ne10_int32_t in_step = nfft >> 2;
587  const ne10_int32_t out_step = mstride;
588 
589  const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
590  ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
591  const ne10_fft_cpx_float32_t *tw;
592 
593  for (f_count = fstride; f_count; f_count --)
594  {
595  tw = twiddles;
596 
597  // first butterfly
598  ne10_radix4_c2r_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
599 
600  tw ++;
601  Fin_r ++;
602  Fout_r ++;
603 
604  // other butterfly
605  ne10_radix4_c2r_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
606 
607  // update Fin_r, Fout_r, twiddles
608  tw += ( (out_step >> 1) - 1);
609  Fin_r += 2 * ( (out_step >> 1) - 1);
610  Fout_r += 2 * ( (out_step >> 1) - 1);
611 
612  // last butterfly
613  ne10_radix4_c2r_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
614  tw ++;
615  Fin_r ++;
616  Fout_r ++;
617 
618  Fin_r += 3 * out_step;
619  } // f_count
620 }
621 
622 NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_c (
623  ne10_fft_cpx_float32_t * Fout,
624  const ne10_fft_cpx_float32_t * Fin,
625  const ne10_int32_t * factors,
626  const ne10_fft_cpx_float32_t * twiddles,
627  ne10_fft_cpx_float32_t * buffer)
628 {
629  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
630 
631  ne10_int32_t fstride, mstride, nfft;
632  ne10_int32_t radix;
633  ne10_int32_t stage_count;
634 
635  // init fstride, mstride, radix, nfft
636  stage_count = factors[0];
637  fstride = factors[1];
638  mstride = factors[ (stage_count << 1) - 1 ];
639  radix = factors[ stage_count << 1 ];
640  nfft = radix * fstride;
641 
642  // PRINT_STAGE_INFO;
643 
644  if (stage_count % 2 == 0)
645  {
646  ne10_swap_ptr (buffer, Fout);
647  }
648 
649  // the first stage
650  if (radix == 8) // length of FFT is 2^n (n is odd)
651  {
652  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
653  ne10_radix8_r2c_c (Fout, Fin, fstride, mstride, nfft);
654  }
655  else if (radix == 4) // length of FFT is 2^n (n is even)
656  {
657  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
658  ne10_radix4_r2c_c (Fout, Fin, fstride, mstride, nfft);
659  }
660  // end of first stage
661 
662  // others
663  for (; fstride > 1;)
664  {
665  fstride >>= 2;
666  ne10_swap_ptr (buffer, Fout);
667 
668  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
669  ne10_radix4_r2c_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
670  twiddles += 3 * mstride;
671  mstride <<= 2;
672 
673  } // other stage
674 }
675 
676 NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_c (
677  ne10_fft_cpx_float32_t * Fout,
678  const ne10_fft_cpx_float32_t * Fin,
679  const ne10_int32_t * factors,
680  const ne10_fft_cpx_float32_t * twiddles,
681  ne10_fft_cpx_float32_t * buffer)
682 {
683  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
684 
685  ne10_int32_t fstride, mstride, nfft;
686  ne10_int32_t radix;
687  ne10_int32_t stage_count;
688 
689  // init fstride, mstride, radix, nfft
690  stage_count = factors[0];
691  fstride = factors[1];
692  mstride = factors[ (stage_count << 1) - 1 ];
693  radix = factors[ stage_count << 1 ];
694  nfft = radix * fstride;
695 
696  // fstride, mstride for the last stage
697  fstride = 1;
698  mstride = nfft >> 2;
699  // PRINT_STAGE_INFO;
700 
701  if (stage_count % 2 == 1)
702  {
703  ne10_swap_ptr (buffer, Fout);
704  }
705 
706  // last butterfly -- inversed
707  if (stage_count > 1)
708  {
709  twiddles -= 3 * mstride;
710  // PRINT_STAGE_INFO;
711  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
712  ne10_radix4_c2r_with_twiddles_c (buffer, Fin, fstride, mstride, nfft, twiddles);
713  fstride <<= 2;
714  mstride >>= 2;
715  stage_count --;
716  }
717 
718  // others but the last stage
719  for (; stage_count > 1;)
720  {
721  twiddles -= 3 * mstride;
722  // PRINT_STAGE_INFO;
723  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
724  ne10_radix4_c2r_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
725  fstride <<= 2;
726  mstride >>= 2;
727  stage_count --;
728  ne10_swap_ptr (buffer, Fout);
729  } // other stage
730 
731  // first stage -- inversed
732  if (radix == 8) // length of FFT is 2^n (n is odd)
733  {
734  // PRINT_STAGE_INFO;
735  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
736  ne10_radix8_c2r_c (Fout, buffer, fstride, mstride, nfft);
737  }
738  else if (radix == 4) // length of FFT is 2^n (n is even)
739  {
740  // PRINT_STAGE_INFO;
741  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
742  ne10_radix4_c2r_c (Fout, buffer, fstride, mstride, nfft);
743  }
744 }
745 
757 {
758  ne10_fft_r2c_cfg_float32_t st = NULL;
759  ne10_int32_t ncfft = nfft >> 1;
760  ne10_int32_t result;
761 
762  ne10_uint32_t memneeded = sizeof (ne10_fft_r2c_state_float32_t)
763  + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/
764  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors */
765  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors_neon */
766  + sizeof (ne10_fft_cpx_float32_t) * nfft /* r_twiddles */
767  + sizeof (ne10_fft_cpx_float32_t) * nfft/4 /* r_twiddles_neon */
768  + sizeof (ne10_fft_cpx_float32_t) * (12 + nfft/32*12) /* r_super_twiddles_neon */
769  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
770 
771  st = (ne10_fft_r2c_cfg_float32_t) NE10_MALLOC (memneeded);
772 
773  if (!st)
774  {
775  return st;
776  }
777 
778  ne10_int32_t i,j;
780  const ne10_float32_t pi = NE10_PI;
781  ne10_float32_t phase1;
782 
783  st->nfft = nfft;
784 
785  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_float32_t);
787 
788  st->buffer = (ne10_fft_cpx_float32_t*) address;
789  st->r_twiddles = st->buffer + nfft;
790  st->r_factors = (ne10_int32_t*) (st->r_twiddles + nfft);
791  st->r_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors + (NE10_MAXFACTORS * 2));
792  st->r_factors_neon = (ne10_int32_t*) (st->r_twiddles_neon + nfft/4);
793  st->r_super_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors_neon + (NE10_MAXFACTORS * 2));
794 
795  if (nfft<16)
796  {
797  return st;
798  }
799 
800  // factors and twiddles for rfft C
801  ne10_factor (nfft, st->r_factors, NE10_FACTOR_EIGHT_FIRST_STAGE);
802 
803  // backward twiddles pointers
804  st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft);
805 
806  // factors and twiddles for rfft neon
807  result = ne10_factor (nfft/4, st->r_factors_neon, NE10_FACTOR_EIGHT_FIRST_STAGE);
808  if (result == NE10_ERR)
809  {
810  return st;
811  }
812 
813  // Twiddle table is transposed here to improve cache access performance.
814  st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
815  st->r_twiddles_neon,
816  st->r_factors_neon,
817  nfft/4);
818 
819  // nfft/4 x 4
820  tw = st->r_super_twiddles_neon;
821  for (i = 1; i < 4; i ++)
822  {
823  for (j = 0; j < 4; j++)
824  {
825  phase1 = - 2 * pi * ( (ne10_float32_t) (i * j) / nfft);
826  tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
827  tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
828  }
829  }
830 
831  ne10_int32_t k,s;
832  // [nfft/32] x [3] x [4]
833  // k s j
834  for (k=1; k<nfft/32; k++)
835  {
836  // transposed
837  for (s = 1; s < 4; s++)
838  {
839  for (j = 0; j < 4; j++)
840  {
841  phase1 = - 2 * pi * ( (ne10_float32_t) ((k*4+j) * s) / nfft);
842  tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
843  tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
844  }
845  }
846  }
847  return st;
848 }
849 
861  ne10_float32_t *fin,
863 {
864  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
865 
866  switch(cfg->nfft)
867  {
868  case 2:
869  ne10_radix2_r2c_c((ne10_fft_cpx_float32_t*) fout, (ne10_fft_cpx_float32_t*) fin);
870  break;
871  case 4:
872  ne10_radix4_r2c_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) fin,1,1,4);
873  break;
874  case 8:
875  ne10_radix8_r2c_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) fin,1,1,8);
876  break;
877  default:
878  ne10_mixed_radix_r2c_butterfly_float32_c (
879  fout,
880  (ne10_fft_cpx_float32_t*) fin,
881  cfg->r_factors,
882  cfg->r_twiddles,
883  tmpbuf);
884  break;
885  }
886 
887  fout[0].r = fout[0].i;
888  fout[0].i = 0.0f;
889  fout[(cfg->nfft) >> 1].i = 0.0f;
890 }
891 
905 {
906  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
907 
908  fin[0].i = fin[0].r;
909  fin[0].r = 0.0f;
910  switch(cfg->nfft)
911  {
912  case 2:
913  ne10_radix2_c2r_c((ne10_fft_cpx_float32_t*) fout, (ne10_fft_cpx_float32_t*) &fin[0].i);
914  break;
915  case 4:
916  ne10_radix4_c2r_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) &fin[0].i,1,1,4);
917  break;
918  case 8:
919  ne10_radix8_c2r_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) &fin[0].i,1,1,8);
920  break;
921  default:
922  ne10_mixed_radix_c2r_butterfly_float32_c (
923  (ne10_fft_cpx_float32_t*)fout,
924  (ne10_fft_cpx_float32_t*)&fin[0].i, // first real is moved to first image
925  cfg->r_factors,
926  cfg->r_twiddles_backward,
927  tmpbuf);
928  break;
929  }
930  fin[0].r = fin[0].i;
931  fin[0].i = 0.0f;
932 }
933 
938 #endif // NE10_UNROLL_LEVEL
void ne10_fft_c2r_1d_float32_c(ne10_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Specific implementation of ne10_fft_c2r_1d_float32 using plain C.
ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32(ne10_int32_t nfft)
Creates a configuration structure for variants of ne10_fft_r2c_1d_float32 and ne10_fft_c2r_1d_float32...
#define NE10_MAXFACTORS
Structure for the floating point FFT function.
Definition: NE10_types.h:229
int32_t ne10_int32_t
Definition: NE10_types.h:76
#define ne10_swap_ptr(X, Y)
ne10_int32_t ne10_factor(ne10_int32_t n, ne10_int32_t *facbuf, ne10_int32_t ne10_factor_flags)
Definition: NE10_fft.c:71
float ne10_float32_t
Definition: NE10_types.h:80
#define NE10_FFT_BYTE_ALIGNMENT
Definition: NE10_fft.h:45
uint32_t ne10_uint32_t
Definition: NE10_types.h:77
#define NE10_PI
NE10 defines a number of macros for use in its function signatures.
Definition: NE10_macros.h:47
ne10_fft_cpx_float32_t * ne10_fft_generate_twiddles_transposed_float32(ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t *factors, const ne10_int32_t nfft)
Definition: NE10_fft.c:330
#define NE10_FFT_R2C_CC_CC(OUT, IN)
#define NE10_FFT_C2R_RCR_4R(OUT, IN)
Definition: NE10_fft_bfly.h:56
#define NE10_FACTOR_EIGHT_FIRST_STAGE
Definition: NE10_fft.h:72
ne10_fft_cpx_float32_t * ne10_fft_generate_twiddles_float32(ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t *factors, const ne10_int32_t nfft)
Definition: NE10_fft.c:320
#define NE10_INLINE
Definition: NE10_fft.h:46
#define NE10_FFT_R2C_4R_CC(OUT, IN)
Definition: NE10_fft_bfly.h:72
#define NE10_MALLOC
Definition: NE10_macros.h:53
#define NE10_BYTE_ALIGNMENT(address, alignment)
Definition: NE10_macros.h:63
#define NE10_ERR
Definition: NE10_types.h:66
ne10_float32_t i
Definition: NE10_types.h:233
void ne10_fft_r2c_1d_float32_c(ne10_fft_cpx_float32_t *fout, ne10_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Specific implementation of ne10_fft_r2c_1d_float32 using plain C.
ne10_fft_cpx_float32_t * buffer
Definition: NE10_types.h:271
#define NE10_FFT_C2R_CC_CC(OUT, IN)
#define NE10_FFT_R2C_4R_RCR(OUT, IN)
Definition: NE10_fft_bfly.h:42
#define NE10_FFT_C2R_CC_4R(OUT, IN)
Definition: NE10_fft_bfly.h:87
ne10_float32_t r
Definition: NE10_types.h:232
ne10_fft_r2c_state_float32_t * ne10_fft_r2c_cfg_float32_t
Definition: NE10_types.h:289