Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_int32.neon.c
/*
 * Copyright 2013-16 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_int32.neon.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"

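/*
 * The fixed-size kernels below are radix-4/radix-8 butterflies on Q31
 * fixed-point complex data. The "_scaled" variants shift the inputs right
 * by log2(N) up front so the fixed-point butterflies cannot overflow.
 */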
static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                      ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}

static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}

static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;
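    // TW_81 is cos(pi/4) == sin(pi/4) in Q31 format, i.e. 2^31 * sqrt(2)/2 (truncated).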

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

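    // Q31 twiddle multiply: widen to 64 bits and keep the high word via
    // ">> 31", scaling the +/-45-degree rotation products back to Q31.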
    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);
    t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                      ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);
    t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}

static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);
    t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);
    t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}

static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin,
                                                    ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);
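    // vld2q_s32 de-interleaves the {r, i} pairs: val[0] receives four real
    // parts and val[1] the four matching imaginary parts.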

    q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

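    // vqrdmulhq_s32 yields the doubled, rounded, saturated high half of the
    // product -- a Q31 multiply. The add/sub fix-ups below assemble the full
    // complex products with the stage-two twiddles.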
    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin,
                                                     ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                  ne10_fft_cpx_int32_t * Fin,
                                                  ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

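    // The halving adds/subtracts (vhaddq/vhsubq) scale every butterfly
    // output by 1/2, which accumulates to the 1/16 scaling of this kernel.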
    q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);

    q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin,
                                                   ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
                                              const ne10_fft_cpx_int32_t *src,
                                              ne10_fft_cpx_int32_t *twiddles,
                                              ne10_int32_t ncfft,
                                              ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
    int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fpnk_r, q_fpnk_i;
    int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int32x4_t q_tw_r, q_tw_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    if (scaled_flag)
        NE10_F2I32_FIXDIV (tdc, 2);

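    // A length-2*ncfft real FFT is computed as a length-ncfft complex FFT.
    // Its DC and Nyquist outputs are both purely real, so complex bin 0 of
    // the half-length result unpacks into dst[0] and dst[ncfft].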
    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;

    if (count >= 4)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fpk = vld2q_s32 (p_src);
                q2_fpnk = vld2q_s32 (p_src2);

                q2_tw = vld2q_s32 (p_twiddles);
                q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
                q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s32 (q_fpnk_i);

                q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fpk = vld2q_s32 (p_src);
                q2_fpnk = vld2q_s32 (p_src2);

                q2_tw = vld2q_s32 (p_twiddles);
                q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
                q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s32 (q_fpnk_i);

                q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2 ; ++k)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I32_FIXDIV (fpk, 2);
                NE10_F2I32_FIXDIV (fpnk, 2);
            }

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

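            // Scalar Q31 complex multiply: keep the high 32 bits of each
            // 64-bit product, then "<< 1" restores full Q31 weight.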
            tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
            tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;

            dst[k].r = (f1k.r + tw.r) >> 1;
            dst[k].i = (f1k.i + tw.i) >> 1;
            dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
            dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
        }
    }
}

static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
                                              const ne10_fft_cpx_int32_t *src,
                                              ne10_fft_cpx_int32_t *twiddles,
                                              ne10_int32_t ncfft,
                                              ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
    int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fnkc_r, q_fnkc_i;
    int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

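    // Rebuild the half-length complex spectrum: bins 0 and ncfft of the
    // real FFT are purely real, so they fold back into complex bin 0 here.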
    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;
    if (scaled_flag)
        NE10_F2I32_FIXDIV (dst[0], 2);

    if (count >= 4)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fk = vld2q_s32 (p_src);
                q2_fnkc = vld2q_s32 (p_src2);
                q2_tw = vld2q_s32 (p_twiddles);
                q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
                q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s32 (q_fnkc_i);

                q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fk = vld2q_s32 (p_src);
                q2_fnkc = vld2q_s32 (p_src2);
                q2_tw = vld2q_s32 (p_twiddles);
                q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
                q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s32 (q_fnkc_i);

                q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I32_FIXDIV (fk, 2);
                NE10_F2I32_FIXDIV (fnkc, 2);
            }

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
            fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;

            dst[k].r = fek.r + fok.r;
            dst[k].i = fek.i + fok.i;

            dst[ncfft - k].r = fek.r - fok.r;
            dst[ncfft - k].i = fok.i - fek.i;
        }
    }
}

/**
 * Specific implementation of ne10_fft_c2c_1d_int32 using NEON SIMD capabilities.
 */
void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_cfg_int32_t cfg,
                                 ne10_int32_t inverse_fft,
                                 ne10_int32_t scaled_flag)
{
    // For input shorter than 15, fall back to the C version.
    // We would not get much improvement from NEON for these cases.
    if (cfg->nfft < 15)
    {
        ne10_fft_c2c_1d_int32_c (fout, fin, cfg, inverse_fft, scaled_flag);
        return;
    }

    ne10_int32_t stage_count = cfg->factors[0];
    ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];

    assert ((algorithm_flag == NE10_FFT_ALG_DEFAULT)
            || (algorithm_flag == NE10_FFT_ALG_ANY));

    // For NE10_FFT_ALG_ANY.
    // Function will return inside this branch.
    if (algorithm_flag == NE10_FFT_ALG_ANY)
    {
        if (inverse_fft)
        {
            ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
        }
        else
        {
            ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
        }
        return;
    }

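    // Sizes 4, 8 and 16 map onto the dedicated kernels above; all other
    // supported sizes go through the mixed-radix NEON assembly routines.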
    if (scaled_flag)
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
    else
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
}

/**
 * Specific implementation of ne10_fft_r2c_1d_int32 using NEON SIMD capabilities.
 */
void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                 ne10_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_int32_neon (tmpbuf1, (ne10_fft_cpx_int32_t*) fin, &c2c_state, 0, scaled_flag);
    ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}

/**
 * Specific implementation of ne10_fft_c2r_1d_int32 using NEON SIMD capabilities.
 */
void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    ne10_fft_c2c_1d_int32_neon ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
}
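
A minimal usage sketch for the complex-to-complex entry point above. It assumes the Ne10 plan helpers ne10_fft_alloc_c2c_int32_neon and ne10_fft_destroy_c2c_int32 declared in NE10_dsp.h; verify the exact names against the headers of your Ne10 version.

#include "NE10_dsp.h"

// Sketch: scaled 64-point forward FFT on Q31 complex samples.
static void example_fft64 (ne10_fft_cpx_int32_t *in, ne10_fft_cpx_int32_t *out)
{
    // Allocate a plan (factors, twiddles, scratch buffer) for nfft == 64.
    // ne10_fft_alloc_c2c_int32_neon is assumed from NE10_dsp.h.
    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32_neon (64);
    if (cfg == NULL)
        return;

    // inverse_fft == 0 selects the forward transform; scaled_flag == 1
    // divides by nfft along the way so the Q31 outputs cannot overflow.
    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0, 1);

    ne10_fft_destroy_c2c_int32 (cfg);
}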