Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_int32.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_fft_int32.c
45  */
46 
47 #include "NE10_types.h"
48 #include "NE10_macros.h"
49 #include "NE10_fft.h"
50 
51 
52 static void ne10_mixed_radix_butterfly_int32_c (ne10_fft_cpx_int32_t * Fout,
54  ne10_int32_t * factors,
55  ne10_fft_cpx_int32_t * twiddles,
56  ne10_fft_cpx_int32_t * buffer,
57  ne10_int32_t scaled_flag)
58 {
59  ne10_int32_t fstride, mstride, N;
60  ne10_int32_t f_count, m_count;
61  ne10_int32_t stage_count;
62 
63  ne10_fft_cpx_int32_t scratch_in[8];
64  ne10_fft_cpx_int32_t scratch_out[8];
65  ne10_fft_cpx_int32_t scratch[16];
66  ne10_fft_cpx_int32_t scratch_tw[6];
67 
68  ne10_fft_cpx_int32_t *Fin1, *Fin2, *Fout1, *Fout2;
69  ne10_fft_cpx_int32_t *Fout_ls = Fout;
71  ne10_fft_cpx_int32_t *tw, *tw1, *tw2;
72  const ne10_int32_t TW_81 = 1518500249;
73  const ne10_int32_t TW_81N = -1518500249;
74 
75  // init fstride, mstride, N
76  stage_count = factors[0];
77  fstride = factors[1];
78  mstride = factors[ (stage_count << 1) - 1 ];
79  N = factors[ stage_count << 1 ]; // radix
80  tw = twiddles;
81 
82  // the first stage
83  Fin1 = Fin;
84  Fout1 = Fout;
85  if (N == 8) // length of FFT is 2^n (n is odd)
86  {
87  // radix 8
88  N = fstride << 1; // 1/4 of length of FFT
89  tw = twiddles;
90 
91  Fin1 = Fin;
92  for (f_count = 0; f_count < fstride; f_count ++)
93  {
94  Fout1 = & Fout[ f_count * 8 ];
95  // load
96  if (scaled_flag == 1)
97  {
98  NE10_F2I32_FIXDIV (Fin1[0], 8);
99  NE10_F2I32_FIXDIV (Fin1[fstride * 4], 8);
100  NE10_F2I32_FIXDIV (Fin1[fstride], 8);
101  NE10_F2I32_FIXDIV (Fin1[fstride * 5], 8);
102  NE10_F2I32_FIXDIV (Fin1[fstride * 2], 8);
103  NE10_F2I32_FIXDIV (Fin1[fstride * 6], 8);
104  NE10_F2I32_FIXDIV (Fin1[fstride * 3], 8);
105  NE10_F2I32_FIXDIV (Fin1[fstride * 7], 8);
106  }
107  scratch_in[0].r = Fin1[0].r + Fin1[fstride * 4].r;
108  scratch_in[0].i = Fin1[0].i + Fin1[fstride * 4].i;
109  scratch_in[1].r = Fin1[0].r - Fin1[fstride * 4].r;
110  scratch_in[1].i = Fin1[0].i - Fin1[fstride * 4].i;
111  scratch_in[2].r = Fin1[fstride].r + Fin1[fstride * 5].r;
112  scratch_in[2].i = Fin1[fstride].i + Fin1[fstride * 5].i;
113  scratch_in[3].r = Fin1[fstride].r - Fin1[fstride * 5].r;
114  scratch_in[3].i = Fin1[fstride].i - Fin1[fstride * 5].i;
115  scratch_in[4].r = Fin1[fstride * 2].r + Fin1[fstride * 6].r;
116  scratch_in[4].i = Fin1[fstride * 2].i + Fin1[fstride * 6].i;
117  scratch_in[5].r = Fin1[fstride * 2].r - Fin1[fstride * 6].r;
118  scratch_in[5].i = Fin1[fstride * 2].i - Fin1[fstride * 6].i;
119  scratch_in[6].r = Fin1[fstride * 3].r + Fin1[fstride * 7].r;
120  scratch_in[6].i = Fin1[fstride * 3].i + Fin1[fstride * 7].i;
121  scratch_in[7].r = Fin1[fstride * 3].r - Fin1[fstride * 7].r;
122  scratch_in[7].i = Fin1[fstride * 3].i - Fin1[fstride * 7].i;
123 
124  // radix 4 butterfly without twiddles
125  scratch[0] = scratch_in[0];
126  scratch[1] = scratch_in[1];
127 
128  scratch[2] = scratch_in[2];
129  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r + scratch_in[3].i) * TW_81) >> 31);
130  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i - scratch_in[3].r) * TW_81) >> 31);
131 
132  scratch[4] = scratch_in[4];
133  scratch[5].r = scratch_in[5].i;
134  scratch[5].i = -scratch_in[5].r;
135 
136  scratch[6].r = scratch_in[6].r;
137  scratch[6].i = scratch_in[6].i;
138  scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r - scratch_in[7].i) * TW_81N) >> 31);
139  scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i + scratch_in[7].r) * TW_81N) >> 31);
140 
141  // radix 2 butterfly
142  scratch[8].r = scratch[0].r + scratch[4].r;
143  scratch[8].i = scratch[0].i + scratch[4].i;
144  scratch[9].r = scratch[1].r + scratch[5].r;
145  scratch[9].i = scratch[1].i + scratch[5].i;
146 
147  scratch[10].r = scratch[0].r - scratch[4].r;
148  scratch[10].i = scratch[0].i - scratch[4].i;
149  scratch[11].r = scratch[1].r - scratch[5].r;
150  scratch[11].i = scratch[1].i - scratch[5].i;
151 
152  // radix 2 butterfly
153  scratch[12].r = scratch[2].r + scratch[6].r;
154  scratch[12].i = scratch[2].i + scratch[6].i;
155  scratch[13].r = scratch[3].r + scratch[7].r;
156  scratch[13].i = scratch[3].i + scratch[7].i;
157 
158  scratch[14].r = scratch[2].r - scratch[6].r;
159  scratch[14].i = scratch[2].i - scratch[6].i;
160  scratch[15].r = scratch[3].r - scratch[7].r;
161  scratch[15].i = scratch[3].i - scratch[7].i;
162 
163  // third result
164  scratch_out[4].r = scratch[8].r - scratch[12].r;
165  scratch_out[4].i = scratch[8].i - scratch[12].i;
166  scratch_out[5].r = scratch[9].r - scratch[13].r;
167  scratch_out[5].i = scratch[9].i - scratch[13].i;
168 
169  // first result
170  scratch_out[0].r = scratch[8].r + scratch[12].r;
171  scratch_out[0].i = scratch[8].i + scratch[12].i;
172  scratch_out[1].r = scratch[9].r + scratch[13].r;
173  scratch_out[1].i = scratch[9].i + scratch[13].i;
174 
175  // second result
176  scratch_out[2].r = scratch[10].r + scratch[14].i;
177  scratch_out[2].i = scratch[10].i - scratch[14].r;
178  scratch_out[3].r = scratch[11].r + scratch[15].i;
179  scratch_out[3].i = scratch[11].i - scratch[15].r;
180 
181  // forth result
182  scratch_out[6].r = scratch[10].r - scratch[14].i;
183  scratch_out[6].i = scratch[10].i + scratch[14].r;
184  scratch_out[7].r = scratch[11].r - scratch[15].i;
185  scratch_out[7].i = scratch[11].i + scratch[15].r;
186 
187  // store
188  Fout1[0] = scratch_out[0];
189  Fout1[1] = scratch_out[1];
190  Fout1[2] = scratch_out[2];
191  Fout1[3] = scratch_out[3];
192  Fout1[4] = scratch_out[4];
193  Fout1[5] = scratch_out[5];
194  Fout1[6] = scratch_out[6];
195  Fout1[7] = scratch_out[7];
196 
197  Fin1 += 1;
198  } // f_count
199  fstride >>= 2;
200  stage_count--;
201 
202  // swap
203  Ftmp = buffer;
204  buffer = Fout;
205  Fout = Ftmp;
206  }
207  else if (N == 4) // length of FFT is 2^n (n is even) >= 4
208  {
209  //fstride is nfft>>2
210  for (f_count = fstride; f_count ; f_count --)
211  {
212  // load
213  scratch_in[0] = *Fin1;
214  Fin2 = Fin1 + fstride;
215  scratch_in[1] = *Fin2;
216  Fin2 = Fin2 + fstride;
217  scratch_in[2] = *Fin2;
218  Fin2 = Fin2 + fstride;
219  scratch_in[3] = *Fin2;
220 
221  // radix 4 butterfly without twiddles
222  if (scaled_flag == 1)
223  {
224  NE10_F2I32_FIXDIV (scratch_in[0], 4);
225  NE10_F2I32_FIXDIV (scratch_in[1], 4);
226  NE10_F2I32_FIXDIV (scratch_in[2], 4);
227  NE10_F2I32_FIXDIV (scratch_in[3], 4);
228  }
229 
230  // radix 2 butterfly
231  scratch[0].r = scratch_in[0].r + scratch_in[2].r;
232  scratch[0].i = scratch_in[0].i + scratch_in[2].i;
233 
234  scratch[1].r = scratch_in[0].r - scratch_in[2].r;
235  scratch[1].i = scratch_in[0].i - scratch_in[2].i;
236 
237  // radix 2 butterfly
238  scratch[2].r = scratch_in[1].r + scratch_in[3].r;
239  scratch[2].i = scratch_in[1].i + scratch_in[3].i;
240 
241  scratch[3].r = scratch_in[1].r - scratch_in[3].r;
242  scratch[3].i = scratch_in[1].i - scratch_in[3].i;
243 
244  // third result
245  scratch_out[2].r = scratch[0].r - scratch[2].r;
246  scratch_out[2].i = scratch[0].i - scratch[2].i;
247 
248  // first result
249  scratch_out[0].r = scratch[0].r + scratch[2].r;
250  scratch_out[0].i = scratch[0].i + scratch[2].i;
251 
252  // second result
253  scratch_out[1].r = scratch[1].r + scratch[3].i;
254  scratch_out[1].i = scratch[1].i - scratch[3].r;
255 
256  // forth result
257  scratch_out[3].r = scratch[1].r - scratch[3].i;
258  scratch_out[3].i = scratch[1].i + scratch[3].r;
259 
260  // store
261  * Fout1 ++ = scratch_out[0];
262  * Fout1 ++ = scratch_out[1];
263  * Fout1 ++ = scratch_out[2];
264  * Fout1 ++ = scratch_out[3];
265 
266  Fin1++;
267  } // f_count
268 
269  N = fstride; // 1/4 of length of FFT
270 
271  // swap
272  Ftmp = buffer;
273  buffer = Fout;
274  Fout = Ftmp;
275 
276  // update address for other stages
277  stage_count--;
278  fstride >>= 2;
279  // end of first stage
280  }
281  else if (N == 2) // Length of FFT is 2
282  {
283  scratch_in[0] = Fin1[0];
284  scratch_in[1] = Fin1[1];
285 
286  if (scaled_flag == 1)
287  {
288  NE10_F2I32_FIXDIV (scratch_in[0], 2);
289  NE10_F2I32_FIXDIV (scratch_in[1], 2);
290  }
291 
292  Fout1[0].r = scratch_in[0].r + scratch_in[1].r;
293  Fout1[0].i = scratch_in[0].i + scratch_in[1].i;
294  Fout1[1].r = scratch_in[0].r - scratch_in[1].r;
295  Fout1[1].i = scratch_in[0].i - scratch_in[1].i;
296  return;
297  }
298  else // Length of FFT is 1
299  {
300  Fout1[0] = Fin1[0];
301  return;
302  }
303 
304  // others but the last one
305  for (; stage_count > 1 ; stage_count--)
306  {
307  Fin1 = buffer;
308  for (f_count = 0; f_count < fstride; f_count ++)
309  {
310  Fout1 = & Fout[ f_count * mstride << 2 ];
311  tw1 = tw;
312  for (m_count = mstride; m_count ; m_count --)
313  {
314  // load
315  scratch_tw[0] = *tw1;
316  tw2 = tw1 + mstride;
317  scratch_tw[1] = *tw2;
318  tw2 += mstride;
319  scratch_tw[2] = *tw2;
320  scratch_in[0] = * Fin1;
321  Fin2 = Fin1 + N;
322  scratch_in[1] = * Fin2;
323  Fin2 += N;
324  scratch_in[2] = * Fin2;
325  Fin2 += N;
326  scratch_in[3] = * Fin2;
327  if (scaled_flag == 1)
328  {
329  NE10_F2I32_FIXDIV (scratch_in[0], 4);
330  NE10_F2I32_FIXDIV (scratch_in[1], 4);
331  NE10_F2I32_FIXDIV (scratch_in[2], 4);
332  NE10_F2I32_FIXDIV (scratch_in[3], 4);
333  }
334 
335  // radix 4 butterfly with twiddles
336 
337  scratch[0] = scratch_in[0];
338  scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
339  - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
340  scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
341  + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
342 
343  scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
344  - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
345  scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
346  + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
347 
348  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
349  - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
350  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
351  + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
352 
353  // radix 2 butterfly
354  scratch[4].r = scratch[0].r + scratch[2].r;
355  scratch[4].i = scratch[0].i + scratch[2].i;
356 
357  scratch[5].r = scratch[0].r - scratch[2].r;
358  scratch[5].i = scratch[0].i - scratch[2].i;
359 
360  // radix 2 butterfly
361  scratch[6].r = scratch[1].r + scratch[3].r;
362  scratch[6].i = scratch[1].i + scratch[3].i;
363 
364  scratch[7].r = scratch[1].r - scratch[3].r;
365  scratch[7].i = scratch[1].i - scratch[3].i;
366 
367  // third result
368  scratch_out[2].r = scratch[4].r - scratch[6].r;
369  scratch_out[2].i = scratch[4].i - scratch[6].i;
370 
371  // first result
372  scratch_out[0].r = scratch[4].r + scratch[6].r;
373  scratch_out[0].i = scratch[4].i + scratch[6].i;
374 
375  // second result
376  scratch_out[1].r = scratch[5].r + scratch[7].i;
377  scratch_out[1].i = scratch[5].i - scratch[7].r;
378 
379  // forth result
380  scratch_out[3].r = scratch[5].r - scratch[7].i;
381  scratch_out[3].i = scratch[5].i + scratch[7].r;
382 
383  // store
384  *Fout1 = scratch_out[0];
385  Fout2 = Fout1 + mstride;
386  *Fout2 = scratch_out[1];
387  Fout2 += mstride;
388  *Fout2 = scratch_out[2];
389  Fout2 += mstride;
390  *Fout2 = scratch_out[3];
391 
392  tw1++;
393  Fin1 ++;
394  Fout1 ++;
395  } // m_count
396  } // f_count
397  tw += mstride * 3;
398  mstride <<= 2;
399  // swap
400  Ftmp = buffer;
401  buffer = Fout;
402  Fout = Ftmp;
403  fstride >>= 2;
404  } // stage_count
405 
406  // the last one
407  if (stage_count)
408  {
409  Fin1 = buffer;
410  // if stage count is even, output to the input array
411  Fout1 = Fout_ls;
412 
413  for (f_count = 0; f_count < fstride; f_count ++)
414  {
415  tw1 = tw;
416  for (m_count = mstride; m_count ; m_count --)
417  {
418  // load
419  scratch_tw[0] = *tw1;
420  tw2 = tw1 + mstride;
421  scratch_tw[1] = *tw2;
422  tw2 += mstride;
423  scratch_tw[2] = *tw2;
424  scratch_in[0] = * Fin1;
425  Fin2 = Fin1 + N;
426  scratch_in[1] = * Fin2;
427  Fin2 += N;
428  scratch_in[2] = * Fin2;
429  Fin2 += N;
430  scratch_in[3] = * Fin2;
431  if (scaled_flag == 1)
432  {
433  NE10_F2I32_FIXDIV (scratch_in[0], 4);
434  NE10_F2I32_FIXDIV (scratch_in[1], 4);
435  NE10_F2I32_FIXDIV (scratch_in[2], 4);
436  NE10_F2I32_FIXDIV (scratch_in[3], 4);
437  }
438 
439  // radix 4 butterfly with twiddles
440 
441  scratch[0] = scratch_in[0];
442  scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
443  - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
444  scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
445  + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
446 
447  scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
448  - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
449  scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
450  + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
451 
452  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
453  - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
454  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
455  + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
456 
457  // radix 2 butterfly
458  scratch[4].r = scratch[0].r + scratch[2].r;
459  scratch[4].i = scratch[0].i + scratch[2].i;
460 
461  scratch[5].r = scratch[0].r - scratch[2].r;
462  scratch[5].i = scratch[0].i - scratch[2].i;
463 
464  // radix 2 butterfly
465  scratch[6].r = scratch[1].r + scratch[3].r;
466  scratch[6].i = scratch[1].i + scratch[3].i;
467 
468  scratch[7].r = scratch[1].r - scratch[3].r;
469  scratch[7].i = scratch[1].i - scratch[3].i;
470 
471  // third result
472  scratch_out[2].r = scratch[4].r - scratch[6].r;
473  scratch_out[2].i = scratch[4].i - scratch[6].i;
474 
475  // first result
476  scratch_out[0].r = scratch[4].r + scratch[6].r;
477  scratch_out[0].i = scratch[4].i + scratch[6].i;
478 
479  // second result
480  scratch_out[1].r = scratch[5].r + scratch[7].i;
481  scratch_out[1].i = scratch[5].i - scratch[7].r;
482 
483  // forth result
484  scratch_out[3].r = scratch[5].r - scratch[7].i;
485  scratch_out[3].i = scratch[5].i + scratch[7].r;
486 
487  // store
488  *Fout1 = scratch_out[0];
489  Fout2 = Fout1 + N;
490  *Fout2 = scratch_out[1];
491  Fout2 += N;
492  *Fout2 = scratch_out[2];
493  Fout2 += N;
494  *Fout2 = scratch_out[3];
495 
496  tw1 ++;
497  Fin1 ++;
498  Fout1 ++;
499  } // m_count
500  } // f_count
501  } // last stage
502 }
503 
/*
 * Core of the inverse mixed-radix (radix-2/4/8) fixed-point complex FFT.
 *
 * Mirror of ne10_mixed_radix_butterfly_int32_c with conjugated twiddles:
 * every Q31 complex multiply below uses (r*r + i*i, i*r - r*i), and the
 * fixed +-j / e^{+-j*pi/4} factors of the first stage carry the opposite
 * signs to the forward version. Intermediate stages ping-pong between
 * Fout and buffer; the final stage always writes to the caller's Fout
 * (saved in Fout_ls). Note: no 1/N normalization is visible in this
 * block — presumably handled by the scaled_flag divisions or the caller.
 *
 * @param Fout        output buffer (nfft complex words)
 * @param Fin         input buffer (nfft complex words)
 * @param factors     factors[0] = stage count, factors[1] = first-stage
 *                    fstride, then per-stage (fstride, radix) pairs
 * @param twiddles    twiddle table, Q31 format
 * @param buffer      scratch buffer (nfft complex words)
 * @param scaled_flag when 1, divide by the radix at every stage so the
 *                    Q31 accumulators cannot overflow
 */
static void ne10_mixed_radix_butterfly_inverse_int32_c (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t * factors,
        ne10_fft_cpx_int32_t * twiddles,
        ne10_fft_cpx_int32_t * buffer,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t fstride, mstride, N;
    ne10_int32_t f_count, m_count;
    ne10_int32_t stage_count;

    ne10_fft_cpx_int32_t scratch_in[8];
    ne10_fft_cpx_int32_t scratch_out[8];
    ne10_fft_cpx_int32_t scratch[16];
    ne10_fft_cpx_int32_t scratch_tw[6];

    ne10_fft_cpx_int32_t *Fin1, *Fin2, *Fout1, *Fout2;
    ne10_fft_cpx_int32_t *Fout_ls = Fout;
    ne10_fft_cpx_int32_t *Ftmp;
    ne10_fft_cpx_int32_t *tw, *tw1, *tw2;
    /* TW_81 = round(2^31 * sqrt(2)/2): the radix-8 pi/4 twiddle in Q31. */
    const ne10_int32_t TW_81 = 1518500249;
    const ne10_int32_t TW_81N = -1518500249;

    // init fstride, mstride, N
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    N = factors[ stage_count << 1 ]; // radix of the first stage
    tw = twiddles;

    // the first stage
    Fin1 = Fin;
    Fout1 = Fout;
    if (N == 8) // length of FFT is 2^n (n is odd)
    {
        // radix 8
        N = fstride << 1; // 1/4 of length of FFT
        tw = twiddles;

        Fin1 = Fin;
        for (f_count = 0; f_count < fstride; f_count ++)
        {
            Fout1 = & Fout[ f_count * 8 ];
            // load (with optional scaling by the radix)
            if (scaled_flag == 1)
            {
                NE10_F2I32_FIXDIV (Fin1[0], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 4], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 5], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 2], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 6], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 3], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride * 7], 8);
            }

            // radix-2 pre-combine of the 8 strided inputs
            scratch_in[0].r = Fin1[0].r + Fin1[fstride * 4].r;
            scratch_in[0].i = Fin1[0].i + Fin1[fstride * 4].i;
            scratch_in[1].r = Fin1[0].r - Fin1[fstride * 4].r;
            scratch_in[1].i = Fin1[0].i - Fin1[fstride * 4].i;
            scratch_in[2].r = Fin1[fstride].r + Fin1[fstride * 5].r;
            scratch_in[2].i = Fin1[fstride].i + Fin1[fstride * 5].i;
            scratch_in[3].r = Fin1[fstride].r - Fin1[fstride * 5].r;
            scratch_in[3].i = Fin1[fstride].i - Fin1[fstride * 5].i;
            scratch_in[4].r = Fin1[fstride * 2].r + Fin1[fstride * 6].r;
            scratch_in[4].i = Fin1[fstride * 2].i + Fin1[fstride * 6].i;
            scratch_in[5].r = Fin1[fstride * 2].r - Fin1[fstride * 6].r;
            scratch_in[5].i = Fin1[fstride * 2].i - Fin1[fstride * 6].i;
            scratch_in[6].r = Fin1[fstride * 3].r + Fin1[fstride * 7].r;
            scratch_in[6].i = Fin1[fstride * 3].i + Fin1[fstride * 7].i;
            scratch_in[7].r = Fin1[fstride * 3].r - Fin1[fstride * 7].r;
            scratch_in[7].i = Fin1[fstride * 3].i - Fin1[fstride * 7].i;

            // radix 4 butterfly without twiddles
            // (fixed factors are the conjugates of the forward transform's)
            scratch[0] = scratch_in[0];
            scratch[1] = scratch_in[1];

            scratch[2] = scratch_in[2];
            scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r - scratch_in[3].i) * TW_81) >> 31);
            scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i + scratch_in[3].r) * TW_81) >> 31);

            scratch[4] = scratch_in[4];
            scratch[5].r = -scratch_in[5].i;   // multiply by +j (inverse direction)
            scratch[5].i = scratch_in[5].r;

            scratch[6].r = scratch_in[6].r;
            scratch[6].i = scratch_in[6].i;
            scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r + scratch_in[7].i) * TW_81N) >> 31);
            scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i - scratch_in[7].r) * TW_81N) >> 31);

            // radix 2 butterfly
            scratch[8].r = scratch[0].r + scratch[4].r;
            scratch[8].i = scratch[0].i + scratch[4].i;
            scratch[9].r = scratch[1].r + scratch[5].r;
            scratch[9].i = scratch[1].i + scratch[5].i;

            scratch[10].r = scratch[0].r - scratch[4].r;
            scratch[10].i = scratch[0].i - scratch[4].i;
            scratch[11].r = scratch[1].r - scratch[5].r;
            scratch[11].i = scratch[1].i - scratch[5].i;

            // radix 2 butterfly
            scratch[12].r = scratch[2].r + scratch[6].r;
            scratch[12].i = scratch[2].i + scratch[6].i;
            scratch[13].r = scratch[3].r + scratch[7].r;
            scratch[13].i = scratch[3].i + scratch[7].i;

            scratch[14].r = scratch[2].r - scratch[6].r;
            scratch[14].i = scratch[2].i - scratch[6].i;
            scratch[15].r = scratch[3].r - scratch[7].r;
            scratch[15].i = scratch[3].i - scratch[7].i;

            // third result
            scratch_out[4].r = scratch[8].r - scratch[12].r;
            scratch_out[4].i = scratch[8].i - scratch[12].i;
            scratch_out[5].r = scratch[9].r - scratch[13].r;
            scratch_out[5].i = scratch[9].i - scratch[13].i;

            // first result
            scratch_out[0].r = scratch[8].r + scratch[12].r;
            scratch_out[0].i = scratch[8].i + scratch[12].i;
            scratch_out[1].r = scratch[9].r + scratch[13].r;
            scratch_out[1].i = scratch[9].i + scratch[13].i;

            // second result (signs mirrored vs. forward: +j rotation)
            scratch_out[2].r = scratch[10].r - scratch[14].i;
            scratch_out[2].i = scratch[10].i + scratch[14].r;
            scratch_out[3].r = scratch[11].r - scratch[15].i;
            scratch_out[3].i = scratch[11].i + scratch[15].r;

            // forth result
            scratch_out[6].r = scratch[10].r + scratch[14].i;
            scratch_out[6].i = scratch[10].i - scratch[14].r;
            scratch_out[7].r = scratch[11].r + scratch[15].i;
            scratch_out[7].i = scratch[11].i - scratch[15].r;

            // store
            Fout1[0] = scratch_out[0];
            Fout1[1] = scratch_out[1];
            Fout1[2] = scratch_out[2];
            Fout1[3] = scratch_out[3];
            Fout1[4] = scratch_out[4];
            Fout1[5] = scratch_out[5];
            Fout1[6] = scratch_out[6];
            Fout1[7] = scratch_out[7];

            Fin1 += 1;
        } // f_count
        fstride >>= 2;
        stage_count--;

        // swap: further stages read from the buffer just written
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;
    }
    else if (N == 4) // length of FFT is 2^n (n is even) >= 4
    {
        //fstride is nfft>>2
        for (f_count = fstride; f_count ; f_count --)
        {
            // load
            scratch_in[0] = *Fin1;
            Fin2 = Fin1 + fstride;
            scratch_in[1] = *Fin2;
            Fin2 = Fin2 + fstride;
            scratch_in[2] = *Fin2;
            Fin2 = Fin2 + fstride;
            scratch_in[3] = *Fin2;

            // radix 4 butterfly without twiddles
            if (scaled_flag == 1)
            {
                NE10_F2I32_FIXDIV (scratch_in[0], 4);
                NE10_F2I32_FIXDIV (scratch_in[1], 4);
                NE10_F2I32_FIXDIV (scratch_in[2], 4);
                NE10_F2I32_FIXDIV (scratch_in[3], 4);
            }

            // radix 2 butterfly
            scratch[0].r = scratch_in[0].r + scratch_in[2].r;
            scratch[0].i = scratch_in[0].i + scratch_in[2].i;

            scratch[1].r = scratch_in[0].r - scratch_in[2].r;
            scratch[1].i = scratch_in[0].i - scratch_in[2].i;

            // radix 2 butterfly
            scratch[2].r = scratch_in[1].r + scratch_in[3].r;
            scratch[2].i = scratch_in[1].i + scratch_in[3].i;

            scratch[3].r = scratch_in[1].r - scratch_in[3].r;
            scratch[3].i = scratch_in[1].i - scratch_in[3].i;

            // third result
            scratch_out[2].r = scratch[0].r - scratch[2].r;
            scratch_out[2].i = scratch[0].i - scratch[2].i;

            // first result
            scratch_out[0].r = scratch[0].r + scratch[2].r;
            scratch_out[0].i = scratch[0].i + scratch[2].i;

            // second result (the +j factor of the inverse transform)
            scratch_out[1].r = scratch[1].r - scratch[3].i;
            scratch_out[1].i = scratch[1].i + scratch[3].r;

            // forth result
            scratch_out[3].r = scratch[1].r + scratch[3].i;
            scratch_out[3].i = scratch[1].i - scratch[3].r;

            // store
            * Fout1 ++ = scratch_out[0];
            * Fout1 ++ = scratch_out[1];
            * Fout1 ++ = scratch_out[2];
            * Fout1 ++ = scratch_out[3];

            Fin1++;
        } // f_count

        N = fstride; // 1/4 of length of FFT
        // update address for other stages
        stage_count--;
        fstride >>= 2;

        // swap
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;

        // end of first stage
    }
    else if (N == 2) // Length of FFT is 2
    {
        scratch_in[0] = Fin1[0];
        scratch_in[1] = Fin1[1];

        if (scaled_flag == 1)
        {
            NE10_F2I32_FIXDIV (scratch_in[0], 2);
            NE10_F2I32_FIXDIV (scratch_in[1], 2);
        }

        Fout1[0].r = scratch_in[0].r + scratch_in[1].r;
        Fout1[0].i = scratch_in[0].i + scratch_in[1].i;
        Fout1[1].r = scratch_in[0].r - scratch_in[1].r;
        Fout1[1].i = scratch_in[0].i - scratch_in[1].i;
        return;
    }
    else // Length of FFT is 1
    {
        Fout1[0] = Fin1[0];
        return;
    }


    // others but the last one
    for (; stage_count > 1 ; stage_count--)
    {
        Fin1 = buffer;
        for (f_count = 0; f_count < fstride; f_count ++)
        {
            Fout1 = & Fout[ f_count * mstride << 2 ];
            tw1 = tw;
            for (m_count = mstride; m_count ; m_count --)
            {
                // load three twiddles and four strided inputs
                scratch_tw[0] = *tw1;
                tw2 = tw1 + mstride;
                scratch_tw[1] = *tw2;
                tw2 += mstride;
                scratch_tw[2] = *tw2;
                scratch_in[0] = * Fin1;
                Fin2 = Fin1 + N;
                scratch_in[1] = * Fin2;
                Fin2 += N;
                scratch_in[2] = * Fin2;
                Fin2 += N;
                scratch_in[3] = * Fin2;
                if (scaled_flag == 1)
                {
                    NE10_F2I32_FIXDIV (scratch_in[0], 4);
                    NE10_F2I32_FIXDIV (scratch_in[1], 4);
                    NE10_F2I32_FIXDIV (scratch_in[2], 4);
                    NE10_F2I32_FIXDIV (scratch_in[3], 4);
                }

                // radix 4 butterfly with CONJUGATED twiddles (Q31, >>31)
                scratch[0] = scratch_in[0];
                scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
                scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);

                scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
                scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);

                scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
                scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);

                // radix 2 butterfly
                scratch[4].r = scratch[0].r + scratch[2].r;
                scratch[4].i = scratch[0].i + scratch[2].i;

                scratch[5].r = scratch[0].r - scratch[2].r;
                scratch[5].i = scratch[0].i - scratch[2].i;

                // radix 2 butterfly
                scratch[6].r = scratch[1].r + scratch[3].r;
                scratch[6].i = scratch[1].i + scratch[3].i;

                scratch[7].r = scratch[1].r - scratch[3].r;
                scratch[7].i = scratch[1].i - scratch[3].i;

                // third result
                scratch_out[2].r = scratch[4].r - scratch[6].r;
                scratch_out[2].i = scratch[4].i - scratch[6].i;

                // first result
                scratch_out[0].r = scratch[4].r + scratch[6].r;
                scratch_out[0].i = scratch[4].i + scratch[6].i;

                // second result (+j rotation, mirror of the forward path)
                scratch_out[1].r = scratch[5].r - scratch[7].i;
                scratch_out[1].i = scratch[5].i + scratch[7].r;

                // forth result
                scratch_out[3].r = scratch[5].r + scratch[7].i;
                scratch_out[3].i = scratch[5].i - scratch[7].r;

                // store (outputs are mstride apart within this stage)
                *Fout1 = scratch_out[0];
                Fout2 = Fout1 + mstride;
                *Fout2 = scratch_out[1];
                Fout2 += mstride;
                *Fout2 = scratch_out[2];
                Fout2 += mstride;
                *Fout2 = scratch_out[3];

                tw1++;
                Fin1 ++;
                Fout1 ++;
            } // m_count
        } // f_count
        tw += mstride * 3;
        mstride <<= 2;
        // swap
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;
        fstride >>= 2;
    } // stage_count

    // the last one
    if (stage_count)
    {
        Fin1 = buffer;
        // if stage count is even, output to the input array
        Fout1 = Fout_ls;

        for (f_count = 0; f_count < fstride; f_count ++)
        {
            tw1 = tw;
            for (m_count = mstride; m_count ; m_count --)
            {
                // load
                scratch_tw[0] = *tw1;
                tw2 = tw1 + mstride;
                scratch_tw[1] = *tw2;
                tw2 += mstride;
                scratch_tw[2] = *tw2;
                scratch_in[0] = * Fin1;
                Fin2 = Fin1 + N;
                scratch_in[1] = * Fin2;
                Fin2 += N;
                scratch_in[2] = * Fin2;
                Fin2 += N;
                scratch_in[3] = * Fin2;
                if (scaled_flag == 1)
                {
                    NE10_F2I32_FIXDIV (scratch_in[0], 4);
                    NE10_F2I32_FIXDIV (scratch_in[1], 4);
                    NE10_F2I32_FIXDIV (scratch_in[2], 4);
                    NE10_F2I32_FIXDIV (scratch_in[3], 4);
                }

                // radix 4 butterfly with CONJUGATED twiddles
                scratch[0] = scratch_in[0];
                scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
                scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);

                scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
                scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);

                scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
                scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);

                // radix 2 butterfly
                scratch[4].r = scratch[0].r + scratch[2].r;
                scratch[4].i = scratch[0].i + scratch[2].i;

                scratch[5].r = scratch[0].r - scratch[2].r;
                scratch[5].i = scratch[0].i - scratch[2].i;

                // radix 2 butterfly
                scratch[6].r = scratch[1].r + scratch[3].r;
                scratch[6].i = scratch[1].i + scratch[3].i;

                scratch[7].r = scratch[1].r - scratch[3].r;
                scratch[7].i = scratch[1].i - scratch[3].i;

                // third result
                scratch_out[2].r = scratch[4].r - scratch[6].r;
                scratch_out[2].i = scratch[4].i - scratch[6].i;

                // first result
                scratch_out[0].r = scratch[4].r + scratch[6].r;
                scratch_out[0].i = scratch[4].i + scratch[6].i;

                // second result
                scratch_out[1].r = scratch[5].r - scratch[7].i;
                scratch_out[1].i = scratch[5].i + scratch[7].r;

                // forth result
                scratch_out[3].r = scratch[5].r + scratch[7].i;
                scratch_out[3].i = scratch[5].i - scratch[7].r;

                // store (outputs are N apart in the final stage)
                *Fout1 = scratch_out[0];
                Fout2 = Fout1 + N;
                *Fout2 = scratch_out[1];
                Fout2 += N;
                *Fout2 = scratch_out[2];
                Fout2 += N;
                *Fout2 = scratch_out[3];

                tw1 ++;
                Fin1 ++;
                Fout1 ++;
            } // m_count
        } // f_count
    } // last stage
}
958 
959 static void ne10_fft_split_r2c_1d_int32 (ne10_fft_cpx_int32_t *dst,
960  const ne10_fft_cpx_int32_t *src,
961  ne10_fft_cpx_int32_t *twiddles,
962  ne10_int32_t ncfft,
963  ne10_int32_t scaled_flag)
964 {
965  ne10_int32_t k;
966  ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
967 
968  tdc.r = src[0].r;
969  tdc.i = src[0].i;
970 
971  if (scaled_flag)
972  NE10_F2I32_FIXDIV (tdc, 2);
973 
974  dst[0].r = tdc.r + tdc.i;
975  dst[ncfft].r = tdc.r - tdc.i;
976  dst[ncfft].i = dst[0].i = 0;
977 
978  for (k = 1; k <= ncfft / 2 ; ++k)
979  {
980  fpk = src[k];
981  fpnk.r = src[ncfft - k].r;
982  fpnk.i = - src[ncfft - k].i;
983  if (scaled_flag)
984  {
985  NE10_F2I32_FIXDIV (fpk, 2);
986  NE10_F2I32_FIXDIV (fpnk, 2);
987  }
988 
989  f1k.r = fpk.r + fpnk.r;
990  f1k.i = fpk.i + fpnk.i;
991 
992  f2k.r = fpk.r - fpnk.r;
993  f2k.i = fpk.i - fpnk.i;
994 
995  tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
996  tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
997 
998  dst[k].r = (f1k.r + tw.r) >> 1;
999  dst[k].i = (f1k.i + tw.i) >> 1;
1000  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1001  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1002  }
1003 }
1004 
1005 static void ne10_fft_split_c2r_1d_int32 (ne10_fft_cpx_int32_t *dst,
1006  const ne10_fft_cpx_int32_t *src,
1007  ne10_fft_cpx_int32_t *twiddles,
1008  ne10_int32_t ncfft,
1009  ne10_int32_t scaled_flag)
1010 {
1011 
1012  ne10_int32_t k;
1013  ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
1014 
1015 
1016  dst[0].r = src[0].r + src[ncfft].r;
1017  dst[0].i = src[0].r - src[ncfft].r;
1018 
1019  if (scaled_flag)
1020  NE10_F2I32_FIXDIV (dst[0], 2);
1021 
1022  for (k = 1; k <= ncfft / 2; k++)
1023  {
1024  fk = src[k];
1025  fnkc.r = src[ncfft - k].r;
1026  fnkc.i = -src[ncfft - k].i;
1027  if (scaled_flag)
1028  {
1029  NE10_F2I32_FIXDIV (fk, 2);
1030  NE10_F2I32_FIXDIV (fnkc, 2);
1031  }
1032 
1033  fek.r = fk.r + fnkc.r;
1034  fek.i = fk.i + fnkc.i;
1035 
1036  tmp.r = fk.r - fnkc.r;
1037  tmp.i = fk.i - fnkc.i;
1038 
1039  fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1040  fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
1041 
1042  dst[k].r = fek.r + fok.r;
1043  dst[k].i = fek.i + fok.i;
1044 
1045  dst[ncfft - k].r = fek.r - fok.r;
1046  dst[ncfft - k].i = fok.i - fek.i;
1047  }
1048 }
1049 
1055 {
1056  ne10_fft_cfg_int32_t st = NULL;
1057  ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
1058  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors */
1059  + sizeof (ne10_fft_cpx_int32_t) * nfft /* twiddles */
1060  + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer */
1061  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment */
1062 
1063  st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);
1064  if (st)
1065  {
1066  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
1068  st->factors = (ne10_int32_t*) address;
1069  st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
1070  st->buffer = st->twiddles + nfft;
1071  st->nfft = nfft;
1072 
1074  if (result == NE10_ERR)
1075  {
1076  NE10_FREE (st);
1077  return NULL;
1078  }
1079 
1080  // Disable radix 8 for generic INT32 FFTs (it isn't supported)
1081  {
1082  ne10_int32_t stage_count = st->factors[0];
1083  ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
1084 
1085  if (algorithm_flag == NE10_FFT_ALG_ANY)
1086  {
1087  result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
1088  if (result == NE10_ERR)
1089  {
1090  NE10_FREE (st);
1091  return NULL;
1092  }
1093  }
1094  }
1095 
1097  }
1098 
1099  return st;
1100 }
1101 
1107  ne10_fft_cpx_int32_t *fin,
1109  ne10_int32_t inverse_fft,
1110  ne10_int32_t scaled_flag)
1111 {
1112  ne10_int32_t stage_count = cfg->factors[0];
1113  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1114 
1115  assert ((algorithm_flag == NE10_FFT_ALG_DEFAULT)
1116  || (algorithm_flag == NE10_FFT_ALG_ANY));
1117 
1118  switch (algorithm_flag)
1119  {
1120  case NE10_FFT_ALG_DEFAULT:
1121  if (inverse_fft)
1122  {
1123  ne10_mixed_radix_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1124  }
1125  else
1126  {
1127  ne10_mixed_radix_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1128  }
1129  break;
1130  case NE10_FFT_ALG_ANY:
1131  if (inverse_fft)
1132  {
1133  ne10_mixed_radix_generic_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1134  }
1135  else
1136  {
1137  ne10_mixed_radix_generic_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1138  }
1139  break;
1140  }
1141 }
1142 
1158 {
1159  ne10_fft_r2c_cfg_int32_t st = NULL;
1160  ne10_int32_t ncfft = nfft >> 1;
1161 
1162  ne10_uint32_t memneeded = sizeof (ne10_fft_r2c_state_int32_t)
1163  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors */
1164  + sizeof (ne10_fft_cpx_int32_t) * ncfft /* twiddles */
1165  + sizeof (ne10_fft_cpx_int32_t) * ncfft / 2 /* super twiddles */
1166  + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer */
1167  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment */
1168 
1169  st = (ne10_fft_r2c_cfg_int32_t) NE10_MALLOC (memneeded);
1170 
1171  if (st)
1172  {
1173  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_int32_t);
1174  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
1175  st->factors = (ne10_int32_t*) address;
1176  st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
1177  st->super_twiddles = st->twiddles + ncfft;
1178  st->buffer = st->super_twiddles + (ncfft / 2);
1179  st->ncfft = ncfft;
1180 
1182  if (result == NE10_ERR)
1183  {
1184  NE10_FREE (st);
1185  return NULL;
1186  }
1187 
1188  ne10_int32_t j, k;
1189  ne10_int32_t *factors = st->factors;
1190  ne10_fft_cpx_int32_t *twiddles = st->twiddles;
1191  ne10_int32_t stage_count = factors[0];
1192  ne10_int32_t fstride = factors[1];
1193  ne10_int32_t mstride;
1194  ne10_int32_t cur_radix;
1195  ne10_float32_t phase;
1196  const ne10_float32_t pi = NE10_PI;
1197 
1198  // Don't generate any twiddles for the first stage
1199  stage_count --;
1200 
1201  // Generate twiddles for the other stages
1202  for (; stage_count > 0; stage_count --)
1203  {
1204  cur_radix = factors[2 * stage_count];
1205  fstride /= cur_radix;
1206  mstride = factors[2 * stage_count + 1];
1207  for (j = 0; j < mstride; j++)
1208  {
1209  for (k = 1; k < cur_radix; k++) // phase = 1 when k = 0
1210  {
1211  phase = -2 * pi * fstride * k * j / ncfft;
1212  twiddles[mstride * (k - 1) + j].r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase));
1213  twiddles[mstride * (k - 1) + j].i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase));
1214  }
1215  }
1216  twiddles += mstride * (cur_radix - 1);
1217  }
1218 
1219  twiddles = st->super_twiddles;
1220  for (j = 0; j < ncfft / 2; j++)
1221  {
1222  phase = -pi * ( (ne10_float32_t) (j + 1) / ncfft + 0.5f);
1223  twiddles->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase));
1224  twiddles->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase));
1225  twiddles++;
1226  }
1227  }
1228 
1229  return st;
1230 }
1231 
1237  ne10_int32_t *fin,
1239  ne10_int32_t scaled_flag)
1240 {
1241  ne10_fft_cpx_int32_t * tmpbuf = cfg->buffer;
1242 
1243  ne10_mixed_radix_butterfly_int32_c (tmpbuf, (ne10_fft_cpx_int32_t*) fin, cfg->factors, cfg->twiddles, fout, scaled_flag);
1244  ne10_fft_split_r2c_1d_int32 (fout, tmpbuf, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1245 }
1246 
1252  ne10_fft_cpx_int32_t *fin,
1254  ne10_int32_t scaled_flag)
1255 {
1256  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1257  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1258 
1259  ne10_fft_split_c2r_1d_int32 (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1260  ne10_mixed_radix_butterfly_inverse_int32_c ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, cfg->factors, cfg->twiddles, tmpbuf2, scaled_flag);
1261 }
#define NE10_FFT_ALG_DEFAULT
Definition: NE10_fft.h:57
ne10_fft_cpx_int32_t * twiddles
Definition: NE10_types.h:335
#define NE10_MAXFACTORS
Structure for the floating point FFT function.
Definition: NE10_types.h:229
int32_t ne10_int32_t
Definition: NE10_types.h:76
ne10_int32_t * factors
Definition: NE10_types.h:346
ne10_int32_t ne10_factor(ne10_int32_t n, ne10_int32_t *facbuf, ne10_int32_t ne10_factor_flags)
Definition: NE10_fft.c:71
float ne10_float32_t
Definition: NE10_types.h:80
#define NE10_FFT_BYTE_ALIGNMENT
Definition: NE10_fft.h:45
void ne10_fft_r2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_r2c_1d_int32 using plain C.
ne10_fft_cpx_int32_t * ne10_fft_generate_twiddles_int32(ne10_fft_cpx_int32_t *twiddles, const ne10_int32_t *factors, const ne10_int32_t nfft)
Definition: NE10_fft.c:246
void ne10_mixed_radix_generic_butterfly_int32_c(ne10_fft_cpx_int32_t *Fout, const ne10_fft_cpx_int32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer, const ne10_int32_t scaled_flag)
Generic (forward) FFT function for 32-bit fixed point.
#define NE10_F2I32_SAMPPROD
Definition: NE10_macros.h:83
uint32_t ne10_uint32_t
Definition: NE10_types.h:77
#define NE10_PI
NE10 defines a number of macros for use in its function signatures.
Definition: NE10_macros.h:47
#define NE10_F2I32_FIXDIV(c, div)
Definition: NE10_macros.h:87
ne10_fft_r2c_state_int32_t * ne10_fft_r2c_cfg_int32_t
Definition: NE10_types.h:352
void ne10_fft_c2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_c2c_1d_int32 using plain C.
#define NE10_FREE(p)
Definition: NE10_macros.h:54
Structure for the 32-bit fixed point FFT function.
Definition: NE10_types.h:325
ne10_int32_t i
Definition: NE10_types.h:328
ne10_int32_t * factors
Definition: NE10_types.h:334
#define NE10_FACTOR_EIGHT_FIRST_STAGE
Definition: NE10_fft.h:72
ne10_fft_r2c_cfg_int32_t ne10_fft_alloc_r2c_int32(ne10_int32_t nfft)
Creates a configuration structure for variants of ne10_fft_r2c_1d_int32 and ne10_fft_c2r_1d_int32.
#define NE10_FFT_ALG_ANY
Definition: NE10_fft.h:58
#define NE10_F2I32_MAX
Definition: NE10_macros.h:81
ne10_fft_cpx_int32_t * twiddles
Definition: NE10_types.h:347
ne10_fft_cpx_int32_t * buffer
Definition: NE10_types.h:336
#define NE10_MALLOC
Definition: NE10_macros.h:53
#define NE10_BYTE_ALIGNMENT(address, alignment)
Definition: NE10_macros.h:63
ne10_fft_cpx_int32_t * buffer
Definition: NE10_types.h:349
#define NE10_ERR
Definition: NE10_types.h:66
ne10_int32_t r
Definition: NE10_types.h:327
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c(ne10_int32_t nfft)
Specific implementation of ne10_fft_alloc_c2c_int32 for ne10_fft_c2c_1d_int32_c.
void ne10_fft_c2r_1d_int32_c(ne10_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_c2r_1d_int32 using plain C.
ne10_fft_cpx_int32_t * super_twiddles
Definition: NE10_types.h:348
void ne10_mixed_radix_generic_butterfly_inverse_int32_c(ne10_fft_cpx_int32_t *Fout, const ne10_fft_cpx_int32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer, const ne10_int32_t scaled_flag)
Generic IFFT function for 32-bit fixed point.
#define NE10_FACTOR_DEFAULT
Definition: NE10_fft.h:71
ne10_fft_state_int32_t * ne10_fft_cfg_int32_t
Definition: NE10_types.h:340