Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_generic_float32.h
Go to the documentation of this file.
1 /*
2  * Copyright 2014-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_fft_generic_float32.h
45  */
46 
47 #ifndef NE10_FFT_GENERIC_FLOAT32_H
48 #define NE10_FFT_GENERIC_FLOAT32_H
49 
50 #include "NE10_types.h"
51 #include "NE10_macros.h"
52 #include "NE10_fft.h"
53 
55 // Multiply input with twiddles
57 static inline void FFT2_MUL_TW (ne10_fft_cpx_float32_t scratch_out[2],
58  const ne10_fft_cpx_float32_t scratch_in[2],
59  const ne10_fft_cpx_float32_t scratch_tw[1])
60 {
61  scratch_out[0] = scratch_in[0];
62  NE10_CPX_MUL_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
63 }
64 
65 static inline void FFT3_MUL_TW (ne10_fft_cpx_float32_t scratch_out[3],
66  const ne10_fft_cpx_float32_t scratch_in[3],
67  const ne10_fft_cpx_float32_t scratch_tw[2])
68 {
69  FFT2_MUL_TW (scratch_out, scratch_in, scratch_tw);
70  NE10_CPX_MUL_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
71 }
72 
73 static inline void FFT4_MUL_TW (ne10_fft_cpx_float32_t scratch_out[4],
74  const ne10_fft_cpx_float32_t scratch_in[4],
75  const ne10_fft_cpx_float32_t scratch_tw[3])
76 {
77  FFT3_MUL_TW (scratch_out, scratch_in, scratch_tw);
78  NE10_CPX_MUL_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
79 }
80 
81 static inline void FFT8_FCU (ne10_fft_cpx_float32_t out[8],
82  const ne10_fft_cpx_float32_t in[8])
83 {
85 
86  const static ne10_fft_cpx_float32_t TW_8[4] = {
87  { 1.00000, 0.00000 },
88  { 0.70711, -0.70711 },
89  { 0.00000, -1.00000 },
90  { -0.70711, -0.70711 },
91  };
92 
93 #define NE10_BUTTERFLY_INDEX_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
94  do { \
95  NE10_CPX_ADD (OUT[OUT_I],IN[IN_I],IN[IN_J]); \
96  NE10_CPX_SUB (OUT[OUT_J],IN[IN_I],IN[IN_J]); \
97  } while (0)
98 
99  // STAGE - 1
100  // in -> s
101  {
102  NE10_BUTTERFLY_INDEX_F32 (s,in,0,4,0,4);
103  NE10_BUTTERFLY_INDEX_F32 (s,in,1,5,1,5);
104  NE10_BUTTERFLY_INDEX_F32 (s,in,2,6,2,6);
105  NE10_BUTTERFLY_INDEX_F32 (s,in,3,7,3,7);
106  }
107 
108  // STAGE - 2
109  // s -> out
110  {
111  // TW
112 #define NE10_CPX_MUL_TW8_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
113  do { \
114  ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
115  NE10_CPX_MUL_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP); \
116  } while (0)
117 
118  NE10_CPX_MUL_TW8_F32 (s,TW_8,4,0);
119  NE10_CPX_MUL_TW8_F32 (s,TW_8,5,1);
120  NE10_CPX_MUL_TW8_F32 (s,TW_8,6,2);
121  NE10_CPX_MUL_TW8_F32 (s,TW_8,7,3);
122 
123  NE10_BUTTERFLY_INDEX_F32 (out,s,0,2,0,2);
124  NE10_BUTTERFLY_INDEX_F32 (out,s,1,3,1,3);
125  NE10_BUTTERFLY_INDEX_F32 (out,s,4,6,4,6);
126  NE10_BUTTERFLY_INDEX_F32 (out,s,5,7,5,7);
127  }
128  // STAGE - 3
129  // out -> s
130  {
131  // TW
132  NE10_CPX_MUL_TW8_F32 (out,TW_8,2,0);
133  NE10_CPX_MUL_TW8_F32 (out,TW_8,3,2);
134  NE10_CPX_MUL_TW8_F32 (out,TW_8,6,0);
135  NE10_CPX_MUL_TW8_F32 (out,TW_8,7,2);
136 #undef NE10_CPX_MUL_TW8_F32
137 
138  NE10_BUTTERFLY_INDEX_F32 (s,out,0,4,0,1);
139  NE10_BUTTERFLY_INDEX_F32 (s,out,2,6,2,3);
140  NE10_BUTTERFLY_INDEX_F32 (s,out,1,5,4,5);
141  NE10_BUTTERFLY_INDEX_F32 (s,out,3,7,6,7);
142  }
143 
144  out[0] = s[0];
145  out[1] = s[1];
146  out[2] = s[2];
147  out[3] = s[3];
148  out[4] = s[4];
149  out[5] = s[5];
150  out[6] = s[6];
151  out[7] = s[7];
152 }
153 
154 static inline void FFT5_MUL_TW (ne10_fft_cpx_float32_t scratch_out[5],
155  const ne10_fft_cpx_float32_t scratch_in[5],
156  const ne10_fft_cpx_float32_t scratch_tw[4])
157 {
158  FFT4_MUL_TW (scratch_out, scratch_in, scratch_tw);
159  NE10_CPX_MUL_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
160 }
161 
163 // FFT Kernel
164 // F: Forward
165 // C: Complex
166 // U: Unscaled
168 static inline void FFT2_FCU (ne10_fft_cpx_float32_t scratch_out[2],
169  const ne10_fft_cpx_float32_t scratch_in[2])
170 {
171  NE10_CPX_ADD (scratch_out[0], scratch_in[0], scratch_in[1]);
172  NE10_CPX_SUB (scratch_out[1], scratch_in[0], scratch_in[1]);
173 }
174 
175 static inline void FFT3_FCU (ne10_fft_cpx_float32_t Fout[3],
176  const ne10_fft_cpx_float32_t Fin[3])
177 {
178  ne10_fft_cpx_float32_t scratch[4];
179  ne10_fft_cpx_float32_t scratch_in[3];
180 
181  scratch_in[0] = Fin[0];
182  scratch_in[1] = Fin[1];
183  scratch_in[2] = Fin[2];
184 
185  scratch[1] = scratch_in[1];
186  scratch[2] = scratch_in[2];
187 
188  NE10_CPX_ADD (scratch[3], scratch[1], scratch[2]);
189  NE10_CPX_SUB (scratch[0], scratch[1], scratch[2]);
190 
191  scratch_in[1].r = scratch_in[0].r - scratch[3].r * 0.5;
192  scratch_in[1].i = scratch_in[0].i - scratch[3].i * 0.5;
193 
194  scratch[0].r *= -TW_3I_F32;
195  scratch[0].i *= -TW_3I_F32;
196 
197  scratch_in[0].r += scratch[3].r;
198  scratch_in[0].i += scratch[3].i;
199 
200  scratch_in[2].r = scratch_in[1].r + scratch[0].i;
201  scratch_in[2].i = scratch_in[1].i - scratch[0].r;
202 
203  scratch_in[1].r -= scratch[0].i;
204  scratch_in[1].i += scratch[0].r;
205 
206  Fout[0] = scratch_in[0];
207  Fout[1] = scratch_in[1];
208  Fout[2] = scratch_in[2];
209 }
210 
211 static inline void FFT4_FCU (ne10_fft_cpx_float32_t scratch_out[4],
212  const ne10_fft_cpx_float32_t scratch_in[4])
213 {
214  ne10_fft_cpx_float32_t scratch[4];
215 
216  NE10_CPX_ADD (scratch[0], scratch_in[0], scratch_in[2]);
217  NE10_CPX_SUB (scratch[1], scratch_in[0], scratch_in[2]);
218  NE10_CPX_ADD (scratch[2], scratch_in[1], scratch_in[3]);
219  NE10_CPX_SUB (scratch[3], scratch_in[1], scratch_in[3]);
220 
221  NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
222  NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
223 
224  scratch_out[1].r = scratch[1].r + scratch[3].i;
225  scratch_out[1].i = scratch[1].i - scratch[3].r;
226  scratch_out[3].r = scratch[1].r - scratch[3].i;
227  scratch_out[3].i = scratch[1].i + scratch[3].r;
228 }
229 
230 static inline void FFT4_FCU_INPLACE (ne10_fft_cpx_float32_t scratch_out[4])
231 {
232  ne10_fft_cpx_float32_t scratch[4];
233 
234  NE10_CPX_ADD (scratch[0], scratch_out[0], scratch_out[2]);
235  NE10_CPX_SUB (scratch[1], scratch_out[0], scratch_out[2]);
236  NE10_CPX_ADD (scratch[2], scratch_out[1], scratch_out[3]);
237  NE10_CPX_SUB (scratch[3], scratch_out[1], scratch_out[3]);
238 
239  NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
240  NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
241 
242  scratch_out[1].r = scratch[1].r + scratch[3].i;
243  scratch_out[1].i = scratch[1].i - scratch[3].r;
244  scratch_out[3].r = scratch[1].r - scratch[3].i;
245  scratch_out[3].i = scratch[1].i + scratch[3].r;
246 }
247 
248 static inline void FFT5_FCU (ne10_fft_cpx_float32_t Fout[5],
249  const ne10_fft_cpx_float32_t Fin[5])
250 {
251  ne10_fft_cpx_float32_t scratch[13], scratch_in[5];
252 
253  scratch_in[0] = Fin[0];
254  scratch_in[1] = Fin[1];
255  scratch_in[2] = Fin[2];
256  scratch_in[3] = Fin[3];
257  scratch_in[4] = Fin[4];
258 
259  scratch[0] = scratch_in[0];
260  scratch[1] = scratch_in[1];
261  scratch[2] = scratch_in[2];
262  scratch[3] = scratch_in[3];
263  scratch[4] = scratch_in[4];
264 
265  NE10_CPX_ADD (scratch[ 7], scratch[1], scratch[4]);
266  NE10_CPX_SUB (scratch[10], scratch[1], scratch[4]);
267  NE10_CPX_ADD (scratch[ 8], scratch[2], scratch[3]);
268  NE10_CPX_SUB (scratch[ 9], scratch[2], scratch[3]);
269 
270  scratch_in[0].r += scratch[7].r + scratch[8].r;
271  scratch_in[0].i += scratch[7].i + scratch[8].i;
272 
273  scratch[5].r = scratch[0].r
274  + NE10_S_MUL (scratch[7].r, TW_5A_F32.r)
275  + NE10_S_MUL (scratch[8].r, TW_5B_F32.r);
276  scratch[5].i = scratch[0].i
277  + NE10_S_MUL (scratch[7].i, TW_5A_F32.r)
278  + NE10_S_MUL (scratch[8].i, TW_5B_F32.r);
279 
280  scratch[6].r = NE10_S_MUL (scratch[10].i, TW_5A_F32.i)
281  + NE10_S_MUL (scratch[9].i, TW_5B_F32.i);
282  scratch[6].i = -NE10_S_MUL (scratch[10].r, TW_5A_F32.i)
283  - NE10_S_MUL (scratch[9].r, TW_5B_F32.i);
284 
285  NE10_CPX_SUB (scratch_in[1], scratch[5], scratch[6]);
286  NE10_CPX_ADD (scratch_in[4], scratch[5], scratch[6]);
287 
288  scratch[11].r = scratch[0].r
289  + NE10_S_MUL (scratch[7].r, TW_5B_F32.r)
290  + NE10_S_MUL (scratch[8].r, TW_5A_F32.r);
291  scratch[11].i = scratch[0].i
292  + NE10_S_MUL (scratch[7].i, TW_5B_F32.r)
293  + NE10_S_MUL (scratch[8].i, TW_5A_F32.r);
294 
295  scratch[12].r = -NE10_S_MUL (scratch[10].i, TW_5B_F32.i)
296  + NE10_S_MUL (scratch[9].i, TW_5A_F32.i);
297  scratch[12].i = NE10_S_MUL (scratch[10].r, TW_5B_F32.i)
298  - NE10_S_MUL (scratch[9].r, TW_5A_F32.i);
299 
300  NE10_CPX_ADD (scratch_in[2], scratch[11], scratch[12]);
301  NE10_CPX_SUB (scratch_in[3], scratch[11], scratch[12]);
302 
303  Fout[0] = scratch_in[0];
304  Fout[1] = scratch_in[1];
305  Fout[2] = scratch_in[2];
306  Fout[3] = scratch_in[3];
307  Fout[4] = scratch_in[4];
308 }
309 #endif // NE10_FFT_GENERIC_FLOAT32_H
#define NE10_BUTTERFLY_INDEX_F32(OUT, IN, OUT_I, OUT_J, IN_I, IN_J)
#define NE10_CPX_MUL_TW8_F32(OUT, TW_8_TABLE, OUT_I, TW_J)
#define NE10_S_MUL(X, Y)
#define NE10_CPX_ADD(Z, A, B)
#define NE10_CPX_MUL_F32(Z, A, B)
ne10_float32_t i
Definition: NE10_types.h:233
#define NE10_CPX_SUB(Z, A, B)
ne10_float32_t r
Definition: NE10_types.h:232