Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_int32.neonintrinsic.c
1 /*
2  * Copyright 2013-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fft_int32.neon.c
30  */
31 
32 #include <arm_neon.h>
33 
34 #include "NE10_types.h"
35 #include "NE10_macros.h"
36 #include "NE10_fft.h"
37 #include "NE10_dsp.h"
38 
39 #define FFT4_FS_START \
40  ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
41  ne10_int32_t tmp_r, tmp_i;
42 
43 
44 #define FFT4_FS \
45  s2_r = Fin[0].r - Fin[2].r; \
46  s2_i = Fin[0].i - Fin[2].i; \
47  tmp_r = Fin[0].r + Fin[2].r; \
48  tmp_i = Fin[0].i + Fin[2].i; \
49  s0_r = Fin[1].r + Fin[3].r; \
50  s0_i = Fin[1].i + Fin[3].i; \
51  s1_r = Fin[1].r - Fin[3].r; \
52  s1_i = Fin[1].i - Fin[3].i;
53 
54 #define FFT4_FS_SCALED \
55  s2_r = (Fin[0].r - Fin[2].r) >> 2; \
56  s2_i = (Fin[0].i - Fin[2].i) >> 2; \
57  tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
58  tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
59  s0_r = (Fin[1].r + Fin[3].r) >> 2; \
60  s0_i = (Fin[1].i + Fin[3].i) >> 2; \
61  s1_r = (Fin[1].r - Fin[3].r) >> 2; \
62  s1_i = (Fin[1].i - Fin[3].i) >> 2;
63 
64 #define FFT4_FWD_LS \
65  Fout[2].r = tmp_r - s0_r; \
66  Fout[2].i = tmp_i - s0_i; \
67  Fout[0].r = tmp_r + s0_r; \
68  Fout[0].i = tmp_i + s0_i; \
69  Fout[1].r = s2_r + s1_i; \
70  Fout[1].i = s2_i - s1_r; \
71  Fout[3].r = s2_r - s1_i; \
72  Fout[3].i = s2_i + s1_r;
73 
74 #define FFT4_INV_LS \
75  Fout[2].r = tmp_r - s0_r; \
76  Fout[2].i = tmp_i - s0_i; \
77  Fout[0].r = tmp_r + s0_r; \
78  Fout[0].i = tmp_i + s0_i; \
79  Fout[1].r = s2_r - s1_i; \
80  Fout[1].i = s2_i + s1_r; \
81  Fout[3].r = s2_r + s1_i; \
82  Fout[3].i = s2_i - s1_r;
83 
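/*
 * Note: FFT4_FS / FFT4_FS_SCALED form the first-stage sums and differences
 * of the radix-4 butterfly, and FFT4_FWD_LS / FFT4_INV_LS combine them into
 * the outputs; the forward and inverse variants differ only in the sign of
 * the +/-j cross terms. The scaled variant shifts each term right by 2
 * (a divide by the FFT length 4) so the fixed-point results of the helpers
 * below stay in range.
 */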
84 static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
85  ne10_fft_cpx_int32_t * Fin)
86 
87 {
88  FFT4_FS_START
89  FFT4_FS
90  FFT4_FWD_LS
91 }
92 
93 static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
94  ne10_fft_cpx_int32_t * Fin)
95 
96 {
97  FFT4_FS_START
98  FFT4_FS
99  FFT4_INV_LS
100 }
101 static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
102  ne10_fft_cpx_int32_t * Fin)
103 
104 {
105  FFT4_FS_START
106  FFT4_FS_SCALED
107  FFT4_FWD_LS
108 }
109 
110 static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
111  ne10_fft_cpx_int32_t * Fin)
112 
113 {
114  FFT4_FS_START
115  FFT4_FS_SCALED
116  FFT4_INV_LS
117 }
118 
119 #define FFT8_FS_START \
120  ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
121  ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
122  const ne10_int32_t TW_81 = 1518500249;
123 
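/*
 * TW_81 = 1518500249 is sqrt(2)/2 ~= 0.70710678 in Q31 format, i.e.
 * cos(pi/4) = sin(pi/4), the only non-trivial twiddle factor a radix-8
 * first stage needs. FFT8_FWD_LS / FFT8_INV_LS apply it to the s3/s7 terms
 * with a 32x32->64-bit multiply followed by an arithmetic shift right by 31.
 */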
124 #define FFT8_FS \
125  s0_r = Fin[0].r + Fin[4].r; \
126  s0_i = Fin[0].i + Fin[4].i; \
127  s1_r = Fin[0].r - Fin[4].r; \
128  s1_i = Fin[0].i - Fin[4].i; \
129  s2_r = Fin[1].r + Fin[5].r; \
130  s2_i = Fin[1].i + Fin[5].i; \
131  s3_r = Fin[1].r - Fin[5].r; \
132  s3_i = Fin[1].i - Fin[5].i; \
133  s4_r = Fin[2].r + Fin[6].r; \
134  s4_i = Fin[2].i + Fin[6].i; \
135  s5_r = Fin[2].r - Fin[6].r; \
136  s5_i = Fin[2].i - Fin[6].i; \
137  s6_r = Fin[3].r + Fin[7].r; \
138  s6_i = Fin[3].i + Fin[7].i; \
139  s7_r = Fin[3].r - Fin[7].r; \
140  s7_i = Fin[3].i - Fin[7].i;
141 
142 #define FFT8_FS_SCALED \
143  s0_r = (Fin[0].r + Fin[4].r) >> 3; \
144  s0_i = (Fin[0].i + Fin[4].i) >> 3; \
145  s1_r = (Fin[0].r - Fin[4].r) >> 3; \
146  s1_i = (Fin[0].i - Fin[4].i) >> 3; \
147  s2_r = (Fin[1].r + Fin[5].r) >> 3; \
148  s2_i = (Fin[1].i + Fin[5].i) >> 3; \
149  s3_r = (Fin[1].r - Fin[5].r) >> 3; \
150  s3_i = (Fin[1].i - Fin[5].i) >> 3; \
151  s4_r = (Fin[2].r + Fin[6].r) >> 3; \
152  s4_i = (Fin[2].i + Fin[6].i) >> 3; \
153  s5_r = (Fin[2].r - Fin[6].r) >> 3; \
154  s5_i = (Fin[2].i - Fin[6].i) >> 3; \
155  s6_r = (Fin[3].r + Fin[7].r) >> 3; \
156  s6_i = (Fin[3].i + Fin[7].i) >> 3; \
157  s7_r = (Fin[3].r - Fin[7].r) >> 3; \
158  s7_i = (Fin[3].i - Fin[7].i) >> 3;
159 
160 
161 #define FFT8_FWD_LS \
162  t0_r = s0_r - s4_r; \
163  t0_i = s0_i - s4_i; \
164  t1_r = s0_r + s4_r; \
165  t1_i = s0_i + s4_i; \
166  t2_r = s2_r + s6_r; \
167  t2_i = s2_i + s6_i; \
168  t3_r = s2_r - s6_r; \
169  t3_i = s2_i - s6_i; \
170  Fout[0].r = t1_r + t2_r; \
171  Fout[0].i = t1_i + t2_i; \
172  Fout[4].r = t1_r - t2_r; \
173  Fout[4].i = t1_i - t2_i; \
174  Fout[2].r = t0_r + t3_i; \
175  Fout[2].i = t0_i - t3_r; \
176  Fout[6].r = t0_r - t3_i; \
177  Fout[6].i = t0_i + t3_r; \
178  t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
179  t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
180  t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
181  t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
182  t0_r = s1_r - s5_i; \
183  t0_i = s1_i + s5_r; \
184  t1_r = s1_r + s5_i; \
185  t1_i = s1_i - s5_r; \
186  t2_r = t4_r - t5_r; \
187  t2_i = t4_i - t5_i; \
188  t3_r = t4_r + t5_r; \
189  t3_i = t4_i + t5_i; \
190  Fout[1].r = t1_r + t2_r; \
191  Fout[1].i = t1_i + t2_i; \
192  Fout[5].r = t1_r - t2_r; \
193  Fout[5].i = t1_i - t2_i; \
194  Fout[3].r = t0_r + t3_i; \
195  Fout[3].i = t0_i - t3_r; \
196  Fout[7].r = t0_r - t3_i; \
197  Fout[7].i = t0_i + t3_r;
198 
199 #define FFT8_INV_LS \
200  t0_r = s0_r - s4_r; \
201  t0_i = s0_i - s4_i; \
202  t1_r = s0_r + s4_r; \
203  t1_i = s0_i + s4_i; \
204  t2_r = s2_r + s6_r; \
205  t2_i = s2_i + s6_i; \
206  t3_r = s2_r - s6_r; \
207  t3_i = s2_i - s6_i; \
208  Fout[0].r = t1_r + t2_r; \
209  Fout[0].i = t1_i + t2_i; \
210  Fout[4].r = t1_r - t2_r; \
211  Fout[4].i = t1_i - t2_i; \
212  Fout[2].r = t0_r - t3_i; \
213  Fout[2].i = t0_i + t3_r; \
214  Fout[6].r = t0_r + t3_i; \
215  Fout[6].i = t0_i - t3_r; \
216  t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
217  t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
218  t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
219  t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
220  t0_r = s1_r + s5_i; \
221  t0_i = s1_i - s5_r; \
222  t1_r = s1_r - s5_i; \
223  t1_i = s1_i + s5_r; \
224  t2_r = t4_r - t5_r; \
225  t2_i = t4_i - t5_i; \
226  t3_r = t4_r + t5_r; \
227  t3_i = t4_i + t5_i; \
228  Fout[1].r = t1_r + t2_r; \
229  Fout[1].i = t1_i + t2_i; \
230  Fout[5].r = t1_r - t2_r; \
231  Fout[5].i = t1_i - t2_i; \
232  Fout[3].r = t0_r - t3_i; \
233  Fout[3].i = t0_i + t3_r; \
234  Fout[7].r = t0_r + t3_i; \
235  Fout[7].i = t0_i - t3_r;
236 
237 static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
238  ne10_fft_cpx_int32_t * Fin)
239 
240 {
241  FFT8_FS_START
242  FFT8_FS
243  FFT8_FWD_LS
244 }
245 
246 static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
247  ne10_fft_cpx_int32_t * Fin)
248 
249 {
250  FFT8_FS_START
251  FFT8_FS
252  FFT8_INV_LS
253 }
254 static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
255  ne10_fft_cpx_int32_t * Fin)
256 
257 {
258  FFT8_FS_START
259  FFT8_FS_SCALED
260  FFT8_FWD_LS
261 }
262 
263 static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
264  ne10_fft_cpx_int32_t * Fin)
265 
266 {
267  FFT8_FS_START
268  FFT8_FS_SCALED
269  FFT8_INV_LS
270 }
271 #define FFT16_FS_START \
272  ne10_fft_cpx_int32_t *tw1, *tw2, *tw3; \
273  int32_t *p_src0, *p_src4, *p_src8, *p_src12; \
274  int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; \
275  int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; \
276  int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; \
277  int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
278 
279 #define FFT16_LS_START \
280  int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; \
281  int32_t *p_tw1, *p_tw2, *p_tw3; \
282  int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; \
283  int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; \
284  int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; \
285  int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; \
286  int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; \
287  int32x4x2_t q2_tw1, q2_tw2, q2_tw3; \
288  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
289  int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
290 
291 #define FFT16_FS \
292  p_src0 = (int32_t*) (& (Fin[0])); \
293  p_src4 = (int32_t*) (& (Fin[4])); \
294  p_src8 = (int32_t*) (& (Fin[8])); \
295  p_src12 = (int32_t*) (& (Fin[12])); \
296  q2_in_0123 = vld2q_s32 (p_src0); \
297  q2_in_4567 = vld2q_s32 (p_src4); \
298  q2_in_89ab = vld2q_s32 (p_src8); \
299  q2_in_cdef = vld2q_s32 (p_src12); \
300  q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
301  q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
302  q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
303  q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
304  q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
305  q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
306  q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
307  q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
308  q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r); \
309  q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i); \
310  q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r); \
311  q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
312 
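/*
 * vld2q_s32 / vst2q_s32 load and store four ne10_fft_cpx_int32_t values
 * while de-interleaving them: val[0] holds the four real parts and val[1]
 * the four imaginary parts, so each butterfly below works on whole vectors
 * of real and imaginary components.
 */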
313 #define FFT16_FS_SCALED \
314  p_src0 = (int32_t*) (& (Fin[0])); \
315  p_src4 = (int32_t*) (& (Fin[4])); \
316  p_src8 = (int32_t*) (& (Fin[8])); \
317  p_src12 = (int32_t*) (& (Fin[12])); \
318  q2_in_0123 = vld2q_s32 (p_src0); \
319  q2_in_4567 = vld2q_s32 (p_src4); \
320  q2_in_89ab = vld2q_s32 (p_src8); \
321  q2_in_cdef = vld2q_s32 (p_src12); \
322  q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
323  q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
324  q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
325  q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
326  q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
327  q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
328  q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
329  q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
330  q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r); \
331  q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i); \
332  q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r); \
333  q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
334 
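/*
 * The *_SCALED variants swap vaddq/vsubq for vhaddq_s32/vhsubq_s32, the
 * NEON halving add/subtract: each lane yields (a +/- b) >> 1 with no
 * intermediate overflow. Two halving levels per radix-4 stage divide the
 * data by 4 at every stage, which realises the 1/N scaling of the scaled
 * FFT while keeping the 32-bit values in range.
 */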
335 #define FFT16_LS_LOAD \
336  tw1 = twiddles; \
337  tw2 = twiddles + 4; \
338  tw3 = twiddles + 8; \
339  p_dst0 = (int32_t*) (&Fout[0]); \
340  p_dst1 = (int32_t*) (&Fout[4]); \
341  p_dst2 = (int32_t*) (&Fout[8]); \
342  p_dst3 = (int32_t*) (&Fout[12]); \
343  p_tw1 = (int32_t*) tw1; \
344  p_tw2 = (int32_t*) tw2; \
345  p_tw3 = (int32_t*) tw3; \
346  q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d); \
347  q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d); \
348  q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf); \
349  q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf); \
350  q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0])); \
351  q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0])); \
352  q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0])); \
353  q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0])); \
354  q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1])); \
355  q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1])); \
356  q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1])); \
357  q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1])); \
358  q2_tw1 = vld2q_s32 (p_tw1); \
359  q2_tw2 = vld2q_s32 (p_tw2); \
360  q2_tw3 = vld2q_s32 (p_tw3);
361 
362 #define FFT16_FWD_LS \
363  q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
364  q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
365  q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
366  q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
367  q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
368  q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]); \
369  q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
370  q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
371  q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
372  q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
373  q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
374  q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
375 
376 #define FFT16_INV_LS \
377  q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
378  q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
379  q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
380  q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
381  q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
382  q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]); \
383  q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
384  q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
385  q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
386  q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
387  q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
388  q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
389 
390 #define FFT16_FWD_LS_S0 \
391  q_s0_r = vsubq_s32 (q_s0_r, q_tmp0); \
392  q_s0_i = vaddq_s32 (q_s0_i, q_tmp1); \
393  q_s1_r = vsubq_s32 (q_s1_r, q_tmp2); \
394  q_s1_i = vaddq_s32 (q_s1_i, q_tmp3); \
395  q_s2_r = vsubq_s32 (q_s2_r, q_tmp4); \
396  q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);
397 
398 #define FFT16_INV_LS_S0 \
399  q_s0_r = vaddq_s32 (q_s0_r, q_tmp0); \
400  q_s0_i = vsubq_s32 (q_s0_i, q_tmp1); \
401  q_s1_r = vaddq_s32 (q_s1_r, q_tmp2); \
402  q_s1_i = vsubq_s32 (q_s1_i, q_tmp3); \
403  q_s2_r = vaddq_s32 (q_s2_r, q_tmp4); \
404  q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);
405 
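/*
 * vqrdmulhq_s32(a, b) is a saturating rounding "doubling multiply high":
 * per lane it returns roughly (ne10_int32_t) (((ne10_int64_t) a * b * 2 +
 * (1 << 31)) >> 32), i.e. a Q31 x Q31 -> Q31 product. FFT16_FWD_LS /
 * FFT16_INV_LS form the four partial products of each twiddle
 * multiplication, and FFT16_FWD_LS_S0 / FFT16_INV_LS_S0 combine them as
 * (ac - bd) + j(ad + bc) for the forward transform, or as the product with
 * the conjugated twiddle for the inverse.
 */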
406 #define FFT16_LS_02 \
407  q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r); \
408  q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i); \
409  q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r); \
410  q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i); \
411  q_s3_r = vaddq_s32 (q_s0_r, q_s2_r); \
412  q_s3_i = vaddq_s32 (q_s0_i, q_s2_i); \
413  q_s4_r = vsubq_s32 (q_s0_r, q_s2_r); \
414  q_s4_i = vsubq_s32 (q_s0_i, q_s2_i); \
415  q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r); \
416  q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i); \
417  q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r); \
418  q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);
419 
420 
421 #define FFT16_LS_02_SCALED \
422  q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r); \
423  q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i); \
424  q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r); \
425  q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i); \
426  q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r); \
427  q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i); \
428  q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r); \
429  q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i); \
430  q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r); \
431  q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i); \
432  q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r); \
433  q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);
434 
435 #define FFT16_ST \
436  vst2q_s32 (p_dst0, q2_out_0123); \
437  vst2q_s32 (p_dst1, q2_out_4567); \
438  vst2q_s32 (p_dst2, q2_out_89ab); \
439  vst2q_s32 (p_dst3, q2_out_cdef);
440 
441 static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
442  ne10_fft_cpx_int32_t * Fin,
443  ne10_fft_cpx_int32_t * twiddles)
444 {
445  // the first stage
446  FFT16_FS_START
447  FFT16_FS
448  q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
449  q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
450  q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
451  q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);
452 
453  // second stages
454  FFT16_LS_START
455  FFT16_LS_LOAD
456  FFT16_FWD_LS
457  FFT16_FWD_LS_S0
458  FFT16_LS_02
459 
460  q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
461  q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
462  q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
463  q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
464 
465  FFT16_ST
466 }
467 
468 static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
469  ne10_fft_cpx_int32_t * Fin,
470  ne10_fft_cpx_int32_t * twiddles)
471 {
472  // the first stage
473  FFT16_FS_START
474  FFT16_FS
475  q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
476  q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
477  q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
478  q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);
479 
480  // second stages
481  FFT16_LS_START
482  FFT16_LS_LOAD
483  FFT16_INV_LS
484  FFT16_INV_LS_S0
485  FFT16_LS_02
486 
487  q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
488  q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
489  q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
490  q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
491 
492  FFT16_ST
493 }
494 
495 static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
496  ne10_fft_cpx_int32_t * Fin,
497  ne10_fft_cpx_int32_t * twiddles)
498 {
499  // the first stage
500  FFT16_FS_START
501  FFT16_FS_SCALED
502  q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
503  q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
504  q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
505  q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);
506 
507  // second stages
508  FFT16_LS_START
509  FFT16_LS_LOAD
510  FFT16_FWD_LS
511  FFT16_FWD_LS_S0
512  FFT16_LS_02_SCALED
513 
514  q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
515  q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
516  q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
517  q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
518 
519  FFT16_ST
520 }
521 
522 static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
523  ne10_fft_cpx_int32_t * Fin,
524  ne10_fft_cpx_int32_t * twiddles)
525 {
526  // the first stage
527  FFT16_FS_START
528  FFT16_FS_SCALED
529  q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
530  q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
531  q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
532  q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);
533 
534  // second stages
535  FFT16_LS_START
536  FFT16_LS_LOAD
537  FFT16_INV_LS
538  FFT16_INV_LS_S0
539  FFT16_LS_02_SCALED
540 
541  q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
542  q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
543  q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
544  q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
545 
546  FFT16_ST
547 }
548 
549 
550 #define RADIX8x4_START \
551  ne10_int32_t f_count; \
552  ne10_int32_t src_step = stride << 1; \
553  const ne10_int32_t TW_81 = 1518500249; \
554  const ne10_int32_t TW_81N = -1518500249; \
555  int32_t *p_src, *p_dst; \
556  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7; \
557  int32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i; \
558  int32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i; \
559  int32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i; \
560  int32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i; \
561  int32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i; \
562  int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
563  int32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i; \
564  int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
565  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7; \
566  int32x4_t q_tw_81, q_tw_81n; \
567  p_src = (int32_t *) Fin; \
568  p_dst = (int32_t *) Fout;
569 
570 
571 #define RADIX8x4_LOAD \
572  q2_in0 = vld2q_s32 (p_src); \
573  p_src += src_step; \
574  q2_in2 = vld2q_s32 (p_src); \
575  p_src += src_step; \
576  q2_in4 = vld2q_s32 (p_src); \
577  p_src += src_step; \
578  q2_in6 = vld2q_s32 (p_src); \
579  p_src += src_step; \
580  q2_in1 = vld2q_s32 (p_src); \
581  p_src += src_step; \
582  q2_in3 = vld2q_s32 (p_src); \
583  p_src += src_step; \
584  q2_in5 = vld2q_s32 (p_src); \
585  p_src += src_step; \
586  q2_in7 = vld2q_s32 (p_src); \
587  p_src += src_step;
588 
589 #define RADIX8x4_STORE \
590  q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
591  q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
592  q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
593  q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
594  q2_tmp4 = vtrnq_s32 (q_out4_r, q_out5_r); \
595  q2_tmp5 = vtrnq_s32 (q_out4_i, q_out5_i); \
596  q2_tmp6 = vtrnq_s32 (q_out6_r, q_out7_r); \
597  q2_tmp7 = vtrnq_s32 (q_out6_i, q_out7_i); \
598  q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
599  q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
600  q2_out2.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
601  q2_out2.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
602  q2_out4.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
603  q2_out4.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
604  q2_out6.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
605  q2_out6.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
606  q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[0]), vget_low_s32 (q2_tmp6.val[0])); \
607  q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[0]), vget_low_s32 (q2_tmp7.val[0])); \
608  q2_out3.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[1]), vget_low_s32 (q2_tmp6.val[1])); \
609  q2_out3.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[1]), vget_low_s32 (q2_tmp7.val[1])); \
610  q2_out5.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[0]), vget_high_s32 (q2_tmp6.val[0])); \
611  q2_out5.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[0]), vget_high_s32 (q2_tmp7.val[0])); \
612  q2_out7.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[1]), vget_high_s32 (q2_tmp6.val[1])); \
613  q2_out7.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[1]), vget_high_s32 (q2_tmp7.val[1])); \
614  vst2q_s32 (p_dst, q2_out0); \
615  p_dst += 8; \
616  vst2q_s32 (p_dst, q2_out1); \
617  p_dst += 8; \
618  vst2q_s32 (p_dst, q2_out2); \
619  p_dst += 8; \
620  vst2q_s32 (p_dst, q2_out3); \
621  p_dst += 8; \
622  vst2q_s32 (p_dst, q2_out4); \
623  p_dst += 8; \
624  vst2q_s32 (p_dst, q2_out5); \
625  p_dst += 8; \
626  vst2q_s32 (p_dst, q2_out6); \
627  p_dst += 8; \
628  vst2q_s32 (p_dst, q2_out7); \
629  p_dst += 8; \
630  p_src = p_src - src_step * 8 + 8;
631 
632 #define RADIX8x4_FS_S0 \
633  q_sin0_r = vaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
634  q_sin0_i = vaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
635  q_sin1_r = vsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
636  q_sin1_i = vsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
637  q_sin2_r = vaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
638  q_sin2_i = vaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
639  q_sin3_r = vsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
640  q_sin3_i = vsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
641  q_sin4_r = vaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
642  q_sin4_i = vaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
643  q_sin5_r = vsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
644  q_sin5_i = vsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
645  q_sin6_r = vaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
646  q_sin6_i = vaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
647  q_sin7_r = vsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
648  q_sin7_i = vsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
649 
650 #define RADIX8x4_FWD_S357 \
651  q_tw_81 = vdupq_n_s32 (TW_81); \
652  q_tw_81n = vdupq_n_s32 (TW_81N); \
653  q_s5_r = q_sin5_i; \
654  q_s5_i = vnegq_s32 (q_sin5_r); \
655  q_s3_r = vaddq_s32 (q_sin3_r, q_sin3_i); \
656  q_s3_i = vsubq_s32 (q_sin3_i, q_sin3_r); \
657  q_s7_r = vsubq_s32 (q_sin7_r, q_sin7_i); \
658  q_s7_i = vaddq_s32 (q_sin7_i, q_sin7_r); \
659  q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
660  q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
661  q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
662  q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
663 
664 #define RADIX8x4_INV_S357 \
665  q_tw_81 = vdupq_n_s32 (TW_81); \
666  q_tw_81n = vdupq_n_s32 (TW_81N); \
667  q_s5_r = vnegq_s32 (q_sin5_i); \
668  q_s5_i = q_sin5_r; \
669  q_s3_r = vsubq_s32 (q_sin3_r, q_sin3_i); \
670  q_s3_i = vaddq_s32 (q_sin3_i, q_sin3_r); \
671  q_s7_r = vaddq_s32 (q_sin7_r, q_sin7_i); \
672  q_s7_i = vsubq_s32 (q_sin7_i, q_sin7_r); \
673  q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
674  q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
675  q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
676  q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
677 
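/*
 * RADIX8x4_FWD_S357 / RADIX8x4_INV_S357 rotate the s3, s5 and s7 terms by
 * the radix-8 twiddles: TW_81 / TW_81N are +/- sqrt(2)/2 in Q31, applied
 * with vqdmulhq_s32 (the truncating doubling multiply high), while the s5
 * term only needs a multiplication by -j (forward) or +j (inverse), done by
 * swapping and negating the real/imaginary parts.
 */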
678 #define RADIX8x4_LS_02 \
679  q_s8_r = vaddq_s32 (q_sin0_r, q_sin4_r); \
680  q_s8_i = vaddq_s32 (q_sin0_i, q_sin4_i); \
681  q_s9_r = vaddq_s32 (q_sin1_r, q_s5_r); \
682  q_s9_i = vaddq_s32 (q_sin1_i, q_s5_i); \
683  q_s10_r = vsubq_s32 (q_sin0_r, q_sin4_r); \
684  q_s10_i = vsubq_s32 (q_sin0_i, q_sin4_i); \
685  q_s11_r = vsubq_s32 (q_sin1_r, q_s5_r); \
686  q_s11_i = vsubq_s32 (q_sin1_i, q_s5_i); \
687  q_s12_r = vaddq_s32 (q_sin2_r, q_sin6_r); \
688  q_s12_i = vaddq_s32 (q_sin2_i, q_sin6_i); \
689  q_s13_r = vaddq_s32 (q_s3_r, q_s7_r); \
690  q_s13_i = vaddq_s32 (q_s3_i, q_s7_i); \
691  q_s14_r = vsubq_s32 (q_sin2_r, q_sin6_r); \
692  q_s14_i = vsubq_s32 (q_sin2_i, q_sin6_i); \
693  q_s15_r = vsubq_s32 (q_s3_r, q_s7_r); \
694  q_s15_i = vsubq_s32 (q_s3_i, q_s7_i); \
695  q_out4_r = vsubq_s32 (q_s8_r, q_s12_r); \
696  q_out4_i = vsubq_s32 (q_s8_i, q_s12_i); \
697  q_out5_r = vsubq_s32 (q_s9_r, q_s13_r); \
698  q_out5_i = vsubq_s32 (q_s9_i, q_s13_i); \
699  q_out0_r = vaddq_s32 (q_s8_r, q_s12_r); \
700  q_out0_i = vaddq_s32 (q_s8_i, q_s12_i); \
701  q_out1_r = vaddq_s32 (q_s9_r, q_s13_r); \
702  q_out1_i = vaddq_s32 (q_s9_i, q_s13_i);
703 
704 #define RADIX8x4_FS_S0_SCALED \
705  q_sin0_r = vhaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
706  q_sin0_i = vhaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
707  q_sin1_r = vhsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
708  q_sin1_i = vhsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
709  q_sin2_r = vhaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
710  q_sin2_i = vhaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
711  q_sin3_r = vhsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
712  q_sin3_i = vhsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
713  q_sin4_r = vhaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
714  q_sin4_i = vhaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
715  q_sin5_r = vhsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
716  q_sin5_i = vhsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
717  q_sin6_r = vhaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
718  q_sin6_i = vhaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
719  q_sin7_r = vhsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
720  q_sin7_i = vhsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
721 
722 #define RADIX8x4_LS_02_SCALED \
723  q_s8_r = vhaddq_s32 (q_sin0_r, q_sin4_r); \
724  q_s8_i = vhaddq_s32 (q_sin0_i, q_sin4_i); \
725  q_s9_r = vhaddq_s32 (q_sin1_r, q_s5_r); \
726  q_s9_i = vhaddq_s32 (q_sin1_i, q_s5_i); \
727  q_s10_r = vhsubq_s32 (q_sin0_r, q_sin4_r); \
728  q_s10_i = vhsubq_s32 (q_sin0_i, q_sin4_i); \
729  q_s11_r = vhsubq_s32 (q_sin1_r, q_s5_r); \
730  q_s11_i = vhsubq_s32 (q_sin1_i, q_s5_i); \
731  q_s12_r = vhaddq_s32 (q_sin2_r, q_sin6_r); \
732  q_s12_i = vhaddq_s32 (q_sin2_i, q_sin6_i); \
733  q_s13_r = vhaddq_s32 (q_s3_r, q_s7_r); \
734  q_s13_i = vhaddq_s32 (q_s3_i, q_s7_i); \
735  q_s14_r = vhsubq_s32 (q_sin2_r, q_sin6_r); \
736  q_s14_i = vhsubq_s32 (q_sin2_i, q_sin6_i); \
737  q_s15_r = vhsubq_s32 (q_s3_r, q_s7_r); \
738  q_s15_i = vhsubq_s32 (q_s3_i, q_s7_i); \
739  q_out4_r = vhsubq_s32 (q_s8_r, q_s12_r); \
740  q_out4_i = vhsubq_s32 (q_s8_i, q_s12_i); \
741  q_out5_r = vhsubq_s32 (q_s9_r, q_s13_r); \
742  q_out5_i = vhsubq_s32 (q_s9_i, q_s13_i); \
743  q_out0_r = vhaddq_s32 (q_s8_r, q_s12_r); \
744  q_out0_i = vhaddq_s32 (q_s8_i, q_s12_i); \
745  q_out1_r = vhaddq_s32 (q_s9_r, q_s13_r); \
746  q_out1_i = vhaddq_s32 (q_s9_i, q_s13_i);
747 
748 
749 static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
750  ne10_fft_cpx_int32_t * Fin,
751  ne10_int32_t stride)
752 {
753  RADIX8x4_START
754 
755  for (f_count = 0; f_count < stride; f_count += 4)
756  {
757  RADIX8x4_LOAD
758  RADIX8x4_FS_S0
759 
760 
761  // radix 4 butterfly without twiddles
762  RADIX8x4_FWD_S357
763  RADIX8x4_LS_02
764 
765  q_out2_r = vaddq_s32 (q_s10_r, q_s14_i);
766  q_out2_i = vsubq_s32 (q_s10_i, q_s14_r);
767  q_out3_r = vaddq_s32 (q_s11_r, q_s15_i);
768  q_out3_i = vsubq_s32 (q_s11_i, q_s15_r);
769  q_out6_r = vsubq_s32 (q_s10_r, q_s14_i);
770  q_out6_i = vaddq_s32 (q_s10_i, q_s14_r);
771  q_out7_r = vsubq_s32 (q_s11_r, q_s15_i);
772  q_out7_i = vaddq_s32 (q_s11_i, q_s15_r);
773 
774  RADIX8x4_STORE
775  } // f_count
776 }
777 
778 static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
779  ne10_fft_cpx_int32_t * Fin,
780  ne10_int32_t stride)
781 {
782  RADIX8x4_START
783 
784  for (f_count = 0; f_count < stride; f_count += 4)
785  {
786  RADIX8x4_LOAD
787  RADIX8x4_FS_S0
788 
789  // radix 4 butterfly without twiddles
790  RADIX8x4_INV_S357
791  RADIX8x4_LS_02
792 
793  q_out2_r = vsubq_s32 (q_s10_r, q_s14_i);
794  q_out2_i = vaddq_s32 (q_s10_i, q_s14_r);
795  q_out3_r = vsubq_s32 (q_s11_r, q_s15_i);
796  q_out3_i = vaddq_s32 (q_s11_i, q_s15_r);
797  q_out6_r = vaddq_s32 (q_s10_r, q_s14_i);
798  q_out6_i = vsubq_s32 (q_s10_i, q_s14_r);
799  q_out7_r = vaddq_s32 (q_s11_r, q_s15_i);
800  q_out7_i = vsubq_s32 (q_s11_i, q_s15_r);
801 
802  RADIX8x4_STORE
803  } // f_count
804 }
805 static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
806  ne10_fft_cpx_int32_t * Fin,
807  ne10_int32_t stride)
808 {
809  RADIX8x4_START
810 
811  for (f_count = 0; f_count < stride; f_count += 4)
812  {
813  RADIX8x4_LOAD
814  RADIX8x4_FS_S0_SCALED
815 
816  // radix 4 butterfly without twiddles
817  RADIX8x4_FWD_S357
818  RADIX8x4_LS_02_SCALED
819 
820  q_out2_r = vhaddq_s32 (q_s10_r, q_s14_i);
821  q_out2_i = vhsubq_s32 (q_s10_i, q_s14_r);
822  q_out3_r = vhaddq_s32 (q_s11_r, q_s15_i);
823  q_out3_i = vhsubq_s32 (q_s11_i, q_s15_r);
824  q_out6_r = vhsubq_s32 (q_s10_r, q_s14_i);
825  q_out6_i = vhaddq_s32 (q_s10_i, q_s14_r);
826  q_out7_r = vhsubq_s32 (q_s11_r, q_s15_i);
827  q_out7_i = vhaddq_s32 (q_s11_i, q_s15_r);
828 
829  RADIX8x4_STORE
830  } // f_count
831 }
832 
833 static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
834  ne10_fft_cpx_int32_t * Fin,
835  ne10_int32_t stride)
836 {
837  RADIX8x4_START
838 
839  for (f_count = 0; f_count < stride; f_count += 4)
840  {
841  RADIX8x4_LOAD
842  RADIX8x4_FS_S0_SCALED
843 
844  // radix 4 butterfly without twiddles
845  RADIX8x4_INV_S357
846  RADIX8x4_LS_02_SCALED
847 
848  q_out2_r = vhsubq_s32 (q_s10_r, q_s14_i);
849  q_out2_i = vhaddq_s32 (q_s10_i, q_s14_r);
850  q_out3_r = vhsubq_s32 (q_s11_r, q_s15_i);
851  q_out3_i = vhaddq_s32 (q_s11_i, q_s15_r);
852  q_out6_r = vhaddq_s32 (q_s10_r, q_s14_i);
853  q_out6_i = vhsubq_s32 (q_s10_i, q_s14_r);
854  q_out7_r = vhaddq_s32 (q_s11_r, q_s15_i);
855  q_out7_i = vhsubq_s32 (q_s11_i, q_s15_r);
856 
857  RADIX8x4_STORE
858  } // f_count
859 }
860 
861 #define RADIX4x4_WITHOUT_TW_START \
862  ne10_int32_t f_count; \
863  ne10_int32_t src_step = stride << 1; \
864  int32_t *p_src, *p_dst; \
865  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
866  int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
867  int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
868  int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
869  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
870  p_src = (int32_t *) Fin; \
871  p_dst = (int32_t *) Fout;
872 
873 #define RADIX4x4_WITHOUT_TW_LOAD \
874  q2_in0 = vld2q_s32 (p_src); \
875  p_src += src_step; \
876  q2_in1 = vld2q_s32 (p_src); \
877  p_src += src_step; \
878  q2_in2 = vld2q_s32 (p_src); \
879  p_src += src_step; \
880  q2_in3 = vld2q_s32 (p_src); \
881  p_src += src_step;
882 
883 #define RADIX4x4_WITHOUT_TW_STORE \
884  q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
885  q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
886  q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
887  q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
888  q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
889  q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
890  q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
891  q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
892  q2_out2.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
893  q2_out2.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
894  q2_out3.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
895  q2_out3.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
896  vst2q_s32 (p_dst, q2_out0); \
897  p_dst += 8; \
898  vst2q_s32 (p_dst, q2_out1); \
899  p_dst += 8; \
900  vst2q_s32 (p_dst, q2_out2); \
901  p_dst += 8; \
902  vst2q_s32 (p_dst, q2_out3); \
903  p_dst += 8; \
904  p_src = p_src - src_step * 4 + 8;
905 
906 #define RADIX4x4_WITHOUT_TW_S0 \
907  q_s0_r = vaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
908  q_s0_i = vaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
909  q_s1_r = vsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
910  q_s1_i = vsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
911  q_s2_r = vaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
912  q_s2_i = vaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
913  q_s3_r = vsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
914  q_s3_i = vsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
915  q_out2_r = vsubq_s32 (q_s0_r, q_s2_r); \
916  q_out2_i = vsubq_s32 (q_s0_i, q_s2_i); \
917  q_out0_r = vaddq_s32 (q_s0_r, q_s2_r); \
918  q_out0_i = vaddq_s32 (q_s0_i, q_s2_i);
919 
920 #define RADIX4x4_WITHOUT_TW_S0_SCALED \
921  q_s0_r = vhaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
922  q_s0_i = vhaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
923  q_s1_r = vhsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
924  q_s1_i = vhsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
925  q_s2_r = vhaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
926  q_s2_i = vhaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
927  q_s3_r = vhsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
928  q_s3_i = vhsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
929  q_out2_r = vhsubq_s32 (q_s0_r, q_s2_r); \
930  q_out2_i = vhsubq_s32 (q_s0_i, q_s2_i); \
931  q_out0_r = vhaddq_s32 (q_s0_r, q_s2_r); \
932  q_out0_i = vhaddq_s32 (q_s0_i, q_s2_i);
933 
934 
935 static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
936  ne10_fft_cpx_int32_t * Fin,
937  ne10_int32_t stride)
938 {
939  RADIX4x4_WITHOUT_TW_START
940 
941  for (f_count = 0; f_count < stride; f_count += 4)
942  {
943  // load
944  RADIX4x4_WITHOUT_TW_LOAD
945 
946  // radix 4 butterfly without twiddles
947  RADIX4x4_WITHOUT_TW_S0
948 
949  q_out1_r = vaddq_s32 (q_s1_r, q_s3_i);
950  q_out1_i = vsubq_s32 (q_s1_i, q_s3_r);
951  q_out3_r = vsubq_s32 (q_s1_r, q_s3_i);
952  q_out3_i = vaddq_s32 (q_s1_i, q_s3_r);
953 
954  RADIX4x4_WITHOUT_TW_STORE
955  }
956 }
957 
958 static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
959  ne10_fft_cpx_int32_t * Fin,
960  ne10_int32_t stride)
961 {
962  RADIX4x4_WITHOUT_TW_START
963 
964  for (f_count = 0; f_count < stride; f_count += 4)
965  {
966  // load
967  RADIX4x4_WITHOUT_TW_LOAD
968 
969  // radix 4 butterfly without twiddles
970  RADIX4x4_WITHOUT_TW_S0
971 
972  q_out1_r = vsubq_s32 (q_s1_r, q_s3_i);
973  q_out1_i = vaddq_s32 (q_s1_i, q_s3_r);
974  q_out3_r = vaddq_s32 (q_s1_r, q_s3_i);
975  q_out3_i = vsubq_s32 (q_s1_i, q_s3_r);
976 
977  RADIX4x4_WITHOUT_TW_STORE
978  }
979 }
980 
981 static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
982  ne10_fft_cpx_int32_t * Fin,
983  ne10_int32_t stride)
984 {
985  RADIX4x4_WITHOUT_TW_START
986 
987  for (f_count = 0; f_count < stride; f_count += 4)
988  {
989  // load
990  RADIX4x4_WITHOUT_TW_LOAD
991 
992  // radix 4 butterfly without twiddles
993  RADIX4x4_WITHOUT_TW_S0_SCALED
994 
995  q_out1_r = vhaddq_s32 (q_s1_r, q_s3_i);
996  q_out1_i = vhsubq_s32 (q_s1_i, q_s3_r);
997  q_out3_r = vhsubq_s32 (q_s1_r, q_s3_i);
998  q_out3_i = vhaddq_s32 (q_s1_i, q_s3_r);
999 
1000  RADIX4x4_WITHOUT_TW_STORE
1001  }
1002 }
1003 
1004 static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1005  ne10_fft_cpx_int32_t * Fin,
1006  ne10_int32_t stride)
1007 {
1008  RADIX4x4_WITHOUT_TW_START
1009 
1010  for (f_count = 0; f_count < stride; f_count += 4)
1011  {
1012  // load
1013  RADIX4x4_WITHOUT_TW_LOAD
1014 
1015  // radix 4 butterfly without twiddles
1016  RADIX4x4_WITHOUT_TW_S0_SCALED
1017 
1018  q_out1_r = vhsubq_s32 (q_s1_r, q_s3_i);
1019  q_out1_i = vhaddq_s32 (q_s1_i, q_s3_r);
1020  q_out3_r = vhaddq_s32 (q_s1_r, q_s3_i);
1021  q_out3_i = vhsubq_s32 (q_s1_i, q_s3_r);
1022 
1023  RADIX4x4_WITHOUT_TW_STORE
1024  }
1025 }
1026 
1027 #define RADIX4x4_WITH_TW_START \
1028  ne10_int32_t m_count; \
1029  ne10_int32_t src_step = src_stride << 1; \
1030  ne10_int32_t dst_step = dst_stride << 1; \
1031  ne10_int32_t tw_step = mstride << 1; \
1032  int32_t *p_src, *p_dst, *p_tw; \
1033  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
1034  int32x4x2_t q2_tw0, q2_tw1, q2_tw2; \
1035  int32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
1036  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
1037  int32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i; \
1038  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
1039  p_src = (int32_t *) Fin; \
1040  p_dst = (int32_t *) Fout; \
1041  p_tw = (int32_t *) tw;
1042 
1043 #define RADIX4x4_WITH_TW_LOAD \
1044  q2_in0 = vld2q_s32 (p_src); \
1045  p_src += src_step; \
1046  q2_in1 = vld2q_s32 (p_src); \
1047  p_src += src_step; \
1048  q2_in2 = vld2q_s32 (p_src); \
1049  p_src += src_step; \
1050  q2_in3 = vld2q_s32 (p_src); \
1051  p_src += src_step; \
1052  q2_tw0 = vld2q_s32 (p_tw); \
1053  p_tw += tw_step; \
1054  q2_tw1 = vld2q_s32 (p_tw); \
1055  p_tw += tw_step; \
1056  q2_tw2 = vld2q_s32 (p_tw); \
1057  q_s1_r = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[0]); \
1058  q_s1_i = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[0]); \
1059  q_s2_r = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[0]); \
1060  q_s2_i = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[0]); \
1061  q_s3_r = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[0]); \
1062  q_s3_i = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[0]); \
1063  q_tmp0 = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[1]); \
1064  q_tmp1 = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[1]); \
1065  q_tmp2 = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[1]); \
1066  q_tmp3 = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[1]); \
1067  q_tmp4 = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[1]); \
1068  q_tmp5 = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[1]);
1069 
1070 #define RADIX4x4_WITH_TW_STORE \
1071  vst2q_s32 (p_dst, q2_out0); \
1072  p_dst += dst_step; \
1073  vst2q_s32 (p_dst, q2_out1); \
1074  p_dst += dst_step; \
1075  vst2q_s32 (p_dst, q2_out2); \
1076  p_dst += dst_step; \
1077  vst2q_s32 (p_dst, q2_out3); \
1078  p_dst += dst_step; \
1079  p_src = p_src - src_step * 4 + 8; \
1080  p_dst = p_dst - dst_step * 4 + 8; \
1081  p_tw = p_tw - tw_step * 2 + 8;
1082 
1083 #define RADIX4x4_WITH_TW_S1_FWD \
1084  q_s1_r = vsubq_s32 (q_s1_r, q_tmp0); \
1085  q_s1_i = vaddq_s32 (q_s1_i, q_tmp1); \
1086  q_s2_r = vsubq_s32 (q_s2_r, q_tmp2); \
1087  q_s2_i = vaddq_s32 (q_s2_i, q_tmp3); \
1088  q_s3_r = vsubq_s32 (q_s3_r, q_tmp4); \
1089  q_s3_i = vaddq_s32 (q_s3_i, q_tmp5);
1090 
1091 #define RADIX4x4_WITH_TW_S1_INV \
1092  q_s1_r = vaddq_s32 (q_s1_r, q_tmp0); \
1093  q_s1_i = vsubq_s32 (q_s1_i, q_tmp1); \
1094  q_s2_r = vaddq_s32 (q_s2_r, q_tmp2); \
1095  q_s2_i = vsubq_s32 (q_s2_i, q_tmp3); \
1096  q_s3_r = vaddq_s32 (q_s3_r, q_tmp4); \
1097  q_s3_i = vsubq_s32 (q_s3_i, q_tmp5);
1098 
1099 
1100 #define RADIX4x4_WITH_TW_LS_02 \
1101  q_s4_r = vaddq_s32 (q2_in0.val[0], q_s2_r); \
1102  q_s4_i = vaddq_s32 (q2_in0.val[1], q_s2_i); \
1103  q_s5_r = vsubq_s32 (q2_in0.val[0], q_s2_r); \
1104  q_s5_i = vsubq_s32 (q2_in0.val[1], q_s2_i); \
1105  q_s6_r = vaddq_s32 (q_s1_r, q_s3_r); \
1106  q_s6_i = vaddq_s32 (q_s1_i, q_s3_i); \
1107  q_s7_r = vsubq_s32 (q_s1_r, q_s3_r); \
1108  q_s7_i = vsubq_s32 (q_s1_i, q_s3_i); \
1109  q2_out2.val[0] = vsubq_s32 (q_s4_r, q_s6_r); \
1110  q2_out2.val[1] = vsubq_s32 (q_s4_i, q_s6_i); \
1111  q2_out0.val[0] = vaddq_s32 (q_s4_r, q_s6_r); \
1112  q2_out0.val[1] = vaddq_s32 (q_s4_i, q_s6_i);
1113 
1114 #define RADIX4x4_WITH_TW_LS_02_SCALED \
1115  q_s4_r = vhaddq_s32 (q2_in0.val[0], q_s2_r); \
1116  q_s4_i = vhaddq_s32 (q2_in0.val[1], q_s2_i); \
1117  q_s5_r = vhsubq_s32 (q2_in0.val[0], q_s2_r); \
1118  q_s5_i = vhsubq_s32 (q2_in0.val[1], q_s2_i); \
1119  q_s6_r = vhaddq_s32 (q_s1_r, q_s3_r); \
1120  q_s6_i = vhaddq_s32 (q_s1_i, q_s3_i); \
1121  q_s7_r = vhsubq_s32 (q_s1_r, q_s3_r); \
1122  q_s7_i = vhsubq_s32 (q_s1_i, q_s3_i); \
1123  q2_out2.val[0] = vhsubq_s32 (q_s4_r, q_s6_r); \
1124  q2_out2.val[1] = vhsubq_s32 (q_s4_i, q_s6_i); \
1125  q2_out0.val[0] = vhaddq_s32 (q_s4_r, q_s6_r); \
1126  q2_out0.val[1] = vhaddq_s32 (q_s4_i, q_s6_i);
1127 
1128 
1129 static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1130  ne10_fft_cpx_int32_t * Fin,
1131  ne10_fft_cpx_int32_t * tw,
1132  ne10_int32_t src_stride,
1133  ne10_int32_t dst_stride,
1134  ne10_int32_t mstride)
1135 {
1136  RADIX4x4_WITH_TW_START
1137 
1138  for (m_count = 0; m_count < mstride; m_count += 4)
1139  {
1140  // load
1141  RADIX4x4_WITH_TW_LOAD
1142  RADIX4x4_WITH_TW_S1_FWD
1143 
1144  RADIX4x4_WITH_TW_LS_02
1145 
1146  q2_out1.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1147  q2_out1.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1148  q2_out3.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1149  q2_out3.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1150 
1151  // store
1152  RADIX4x4_WITH_TW_STORE
1153  }
1154 }
1155 
1156 
1157 static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1158  ne10_fft_cpx_int32_t * Fin,
1159  ne10_fft_cpx_int32_t * tw,
1160  ne10_int32_t src_stride,
1161  ne10_int32_t dst_stride,
1162  ne10_int32_t mstride)
1163 {
1164  RADIX4x4_WITH_TW_START
1165 
1166  for (m_count = 0; m_count < mstride; m_count += 4)
1167  {
1168  // load
1169  RADIX4x4_WITH_TW_LOAD
1170  RADIX4x4_WITH_TW_S1_INV
1171 
1172  RADIX4x4_WITH_TW_LS_02
1173 
1174  q2_out1.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1175  q2_out1.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1176  q2_out3.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1177  q2_out3.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1178 
1179  // store
1180  RADIX4x4_WITH_TW_STORE
1181  }
1182 }
1183 
1184 
1185 
1186 static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1187  ne10_fft_cpx_int32_t * Fin,
1188  ne10_fft_cpx_int32_t * tw,
1189  ne10_int32_t src_stride,
1190  ne10_int32_t dst_stride,
1191  ne10_int32_t mstride)
1192 {
1193  RADIX4x4_WITH_TW_START
1194 
1195  for (m_count = 0; m_count < mstride; m_count += 4)
1196  {
1197  // load
1198  RADIX4x4_WITH_TW_LOAD
1199  RADIX4x4_WITH_TW_S1_FWD
1200 
1201  RADIX4x4_WITH_TW_LS_02_SCALED
1202 
1203  q2_out1.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1204  q2_out1.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1205  q2_out3.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1206  q2_out3.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1207 
1208  // store
1209  RADIX4x4_WITH_TW_STORE
1210  }
1211 }
1212 
1213 static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1214  ne10_fft_cpx_int32_t * Fin,
1215  ne10_fft_cpx_int32_t * tw,
1216  ne10_int32_t src_stride,
1217  ne10_int32_t dst_stride,
1218  ne10_int32_t mstride)
1219 {
1220  RADIX4x4_WITH_TW_START
1221 
1222  for (m_count = 0; m_count < mstride; m_count += 4)
1223  {
1224  // load
1225  RADIX4x4_WITH_TW_LOAD
1226  RADIX4x4_WITH_TW_S1_INV
1227 
1228  RADIX4x4_WITH_TW_LS_02_SCALED
1229 
1230  q2_out1.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1231  q2_out1.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1232  q2_out3.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1233  q2_out3.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1234 
1235  // store
1236  RADIX4x4_WITH_TW_STORE
1237  }
1238 }
1239 
1240 #define ne10_mixed_radix_fft_forward_int32_neon(scaled) \
1241 void ne10_mixed_radix_fft_forward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1242  ne10_fft_cpx_int32_t * Fin, \
1243  ne10_int32_t * factors, \
1244  ne10_fft_cpx_int32_t * twiddles, \
1245  ne10_fft_cpx_int32_t * buffer) \
1246 { \
1247  ne10_int32_t fstride, mstride, N; \
1248  ne10_int32_t fstride1; \
1249  ne10_int32_t f_count; \
1250  ne10_int32_t stage_count; \
1251  \
1252  ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1253  ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1254  ne10_fft_cpx_int32_t *Ftmp; \
1255  ne10_fft_cpx_int32_t *tw, *tw1; \
1256  \
1257  /* init fstride, mstride, N */ \
1258  stage_count = factors[0]; \
1259  fstride = factors[1]; \
1260  mstride = factors[ (stage_count << 1) - 1 ]; \
1261  N = factors[ stage_count << 1 ]; \
1262  \
1263  /* the first stage */ \
1264  Fin1 = Fin; \
1265  Fout1 = Fout; \
1266  if (N == 8) \
1267  { \
1268  N = fstride << 1;\
1269  tw = twiddles; \
1270  ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride);\
1271  \
1272  fstride >>= 2; \
1273  stage_count--; \
1274  \
1275  Ftmp = buffer; \
1276  buffer = Fout; \
1277  Fout = Ftmp; \
1278  } \
1279  else if (N == 4) \
1280  { \
1281  ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1282  N = fstride; \
1283  Ftmp = buffer; \
1284  buffer = Fout; \
1285  Fout = Ftmp; \
1286  /* update address for other stages*/ \
1287  stage_count--; \
1288  tw = twiddles; \
1289  fstride >>= 2; \
1290  } \
1291  /* others but the last one*/ \
1292  for (; stage_count > 1 ; stage_count--) \
1293  { \
1294  Fin1 = buffer; \
1295  for (f_count = 0; f_count < fstride; f_count ++) \
1296  { \
1297  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1298  tw1 = tw; \
1299  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1300  Fin1 += mstride; \
1301  } \
1302  tw += mstride * 3; \
1303  mstride <<= 2; \
1304  Ftmp = buffer; \
1305  buffer = Fout; \
1306  Fout = Ftmp; \
1307  fstride >>= 2; \
1308  }\
1309  /* the last one*/ \
1310  if (stage_count) \
1311  { \
1312  Fin1 = buffer; \
1313  Fout1 = Fout_ls; \
1314  for (f_count = 0; f_count < fstride; f_count ++) \
1315  { \
1316  tw1 = tw; \
1317  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1318  Fin1 += mstride; \
1319  Fout1 += mstride; \
1320  } \
1321  } \
1322 }
1323 
1324 #define ne10_mixed_radix_fft_backward_int32_neon(scaled) \
1325 void ne10_mixed_radix_fft_backward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1326  ne10_fft_cpx_int32_t * Fin, \
1327  ne10_int32_t * factors, \
1328  ne10_fft_cpx_int32_t * twiddles, \
1329  ne10_fft_cpx_int32_t * buffer) \
1330 { \
1331  ne10_int32_t fstride, mstride, N; \
1332  ne10_int32_t fstride1; \
1333  ne10_int32_t f_count; \
1334  ne10_int32_t stage_count; \
1335  \
1336  ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1337  ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1338  ne10_fft_cpx_int32_t *Ftmp; \
1339  ne10_fft_cpx_int32_t *tw, *tw1; \
1340  \
1341  /* init fstride, mstride, N */ \
1342  stage_count = factors[0]; \
1343  fstride = factors[1]; \
1344  mstride = factors[ (stage_count << 1) - 1 ]; \
1345  N = factors[ stage_count << 1 ]; \
1346  \
1347  /* the first stage */ \
1348  Fin1 = Fin; \
1349  Fout1 = Fout; \
1350  if (N == 8) \
1351  { \
1352  N = fstride << 1;\
1353  tw = twiddles; \
1354  ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride);\
1355  \
1356  fstride >>= 2; \
1357  stage_count--; \
1358  \
1359  Ftmp = buffer; \
1360  buffer = Fout; \
1361  Fout = Ftmp; \
1362  } \
1363  else if (N == 4) \
1364  { \
1365  ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1366  N = fstride; \
1367  Ftmp = buffer; \
1368  buffer = Fout; \
1369  Fout = Ftmp; \
1370  /* update address for other stages*/ \
1371  stage_count--; \
1372  tw = twiddles; \
1373  fstride >>= 2; \
1374  } \
1375  /* others but the last one*/ \
1376  for (; stage_count > 1 ; stage_count--) \
1377  { \
1378  Fin1 = buffer; \
1379  for (f_count = 0; f_count < fstride; f_count ++) \
1380  { \
1381  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1382  tw1 = tw; \
1383  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1384  Fin1 += mstride; \
1385  } \
1386  tw += mstride * 3; \
1387  mstride <<= 2; \
1388  Ftmp = buffer; \
1389  buffer = Fout; \
1390  Fout = Ftmp; \
1391  fstride >>= 2; \
1392  }\
1393  /* the last one*/ \
1394  if (stage_count) \
1395  { \
1396  Fin1 = buffer; \
1397  Fout1 = Fout_ls; \
1398  for (f_count = 0; f_count < fstride; f_count ++) \
1399  { \
1400  tw1 = tw; \
1401  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1402  Fin1 += mstride; \
1403  Fout1 += mstride; \
1404  } \
1405  } \
1406 }
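/*
 * The two generator macros above produce the four mixed-radix drivers
 * (forward/backward, unscaled/scaled). factors[0] holds the stage count and
 * factors[1] the initial fstride; the first stage is a radix-8 or radix-4
 * pass without twiddles, every later stage is a radix-4 pass with twiddles,
 * and the data ping-pongs between Fout and the temporary buffer so that the
 * last stage writes back into the caller's Fout.
 */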
1407 
1408 ne10_mixed_radix_fft_forward_int32_neon (unscaled)
1409 ne10_mixed_radix_fft_forward_int32_neon (scaled)
1410 ne10_mixed_radix_fft_backward_int32_neon (unscaled)
1411 ne10_mixed_radix_fft_backward_int32_neon (scaled)
1412 
1413 
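/*
 * ne10_fft_split_r2c_1d_int32_neon turns the ncfft-point complex FFT Z[k]
 * of the packed real input into the spectrum of the 2*ncfft-point real
 * sequence. In effect it computes
 *     F1[k] = Z[k] + conj(Z[ncfft - k]),  F2[k] = Z[k] - conj(Z[ncfft - k]),
 *     dst[k]         = (F1[k] + W[k] * F2[k]) / 2,
 *     dst[ncfft - k] = conj(F1[k] - W[k] * F2[k]) / 2,
 * where W[k] is the stored twiddle twiddles[k - 1]. The NEON loops below
 * handle four k values per iteration; for ncfft/2 < 4 the scalar loop is
 * used instead.
 */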
1414 static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1415  const ne10_fft_cpx_int32_t *src,
1416  ne10_fft_cpx_int32_t *twiddles,
1417  ne10_int32_t ncfft,
1418  ne10_int32_t scaled_flag)
1419 {
1420  ne10_int32_t k;
1421  ne10_int32_t count = ncfft / 2;
1422  ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
1423  int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1424  int32x4_t q_fpnk_r, q_fpnk_i;
1425  int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1426  int32x4_t q_tw_r, q_tw_i;
1427  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1428  int32x4_t q_dst2_r, q_dst2_i;
1429  int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1430 
1431  tdc.r = src[0].r;
1432  tdc.i = src[0].i;
1433 
1434  if (scaled_flag)
1435  NE10_F2I32_FIXDIV (tdc, 2);
1436 
1437  dst[0].r = tdc.r + tdc.i;
1438  dst[ncfft].r = tdc.r - tdc.i;
1439  dst[ncfft].i = dst[0].i = 0;
1440  if (count >= 4)
1441  {
1442 
1443  if (scaled_flag)
1444  {
1445  for (k = 1; k <= count ; k += 4)
1446  {
1447  p_src = (int32_t*) (& (src[k]));
1448  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1449  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1450  p_dst = (int32_t*) (& (dst[k]));
1451  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1452 
1453  q2_fpk = vld2q_s32 (p_src);
1454  q2_fpnk = vld2q_s32 (p_src2);
1455 
1456  q2_tw = vld2q_s32 (p_twiddles);
1457  q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1458  q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1459  q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1460  q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1461  q_fpnk_i = vnegq_s32 (q_fpnk_i);
1462 
1463  q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1464  q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1465 
1466  q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1467  q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1468 
1469  q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1470  q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1471  q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1472  q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1473  q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1474  q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1475 
1476  q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1477  q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1478  q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1479  q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1480  q_dst2_r = vrev64q_s32 (q_dst2_r);
1481  q_dst2_i = vrev64q_s32 (q_dst2_i);
1482  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1483  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1484  vst2q_s32 (p_dst, q2_dst);
1485  vst2q_s32 (p_dst2, q2_dst2);
1486 
1487  }
1488  }
1489  else
1490  {
1491  for (k = 1; k <= count ; k += 4)
1492  {
1493  p_src = (int32_t*) (& (src[k]));
1494  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1495  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1496  p_dst = (int32_t*) (& (dst[k]));
1497  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1498 
1499  q2_fpk = vld2q_s32 (p_src);
1500  q2_fpnk = vld2q_s32 (p_src2);
1501 
1502  q2_tw = vld2q_s32 (p_twiddles);
1503  q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1504  q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1505  q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1506  q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1507  q_fpnk_i = vnegq_s32 (q_fpnk_i);
1508 
1509  q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1510  q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1511 
1512  q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1513  q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1514 
1515  q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1516  q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1517  q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1518  q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1519  q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1520  q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1521 
1522  q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1523  q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1524  q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1525  q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1526  q_dst2_r = vrev64q_s32 (q_dst2_r);
1527  q_dst2_i = vrev64q_s32 (q_dst2_i);
1528  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1529  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1530  vst2q_s32 (p_dst, q2_dst);
1531  vst2q_s32 (p_dst2, q2_dst2);
1532 
1533  }
1534  }
1535  }
1536  else
1537  {
1538 
1539  for (k = 1; k <= ncfft / 2 ; ++k)
1540  {
1541  fpk = src[k];
1542  fpnk.r = src[ncfft - k].r;
1543  fpnk.i = - src[ncfft - k].i;
1544  if (scaled_flag)
1545  {
1546  NE10_F2I32_FIXDIV (fpk, 2);
1547  NE10_F2I32_FIXDIV (fpnk, 2);
1548  }
1549 
1550  f1k.r = fpk.r + fpnk.r;
1551  f1k.i = fpk.i + fpnk.i;
1552 
1553  f2k.r = fpk.r - fpnk.r;
1554  f2k.i = fpk.i - fpnk.i;
1555 
1556  tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
1557  tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
1558 
1559  dst[k].r = (f1k.r + tw.r) >> 1;
1560  dst[k].i = (f1k.i + tw.i) >> 1;
1561  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1562  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1563  }
1564  }
1565 }
1566 
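/*
 * ne10_fft_split_c2r_1d_int32_neon is the matching pre-processing for the
 * inverse real FFT: it rebuilds the half-length complex spectrum that is
 * fed to the complex IFFT. In effect, for each k,
 *     fek = src[k] + conj(src[ncfft - k])                  (even part)
 *     fok = (src[k] - conj(src[ncfft - k])) * conj(W[k])   (odd part)
 *     dst[k] = fek + fok,   dst[ncfft - k] = conj(fek - fok),
 * with W[k] the stored twiddle twiddles[k - 1]. As above, four k values are
 * processed per NEON iteration, with a scalar fallback when ncfft/2 < 4.
 */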
1567 static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1568  const ne10_fft_cpx_int32_t *src,
1569  ne10_fft_cpx_int32_t *twiddles,
1570  ne10_int32_t ncfft,
1571  ne10_int32_t scaled_flag)
1572 {
1573 
1574  ne10_int32_t k;
1575  ne10_int32_t count = ncfft / 2;
1576  ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
1577  int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1578  int32x4_t q_fnkc_r, q_fnkc_i;
1579  int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1580  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1581  int32x4_t q_dst2_r, q_dst2_i;
1582  int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1583 
1584 
1585  dst[0].r = src[0].r + src[ncfft].r;
1586  dst[0].i = src[0].r - src[ncfft].r;
1587  if (scaled_flag)
1588  NE10_F2I32_FIXDIV (dst[0], 2);
1589  if (count >= 4)
1590  {
1591  if (scaled_flag)
1592  {
1593  for (k = 1; k <= count ; k += 4)
1594  {
1595  p_src = (int32_t*) (& (src[k]));
1596  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1597  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1598  p_dst = (int32_t*) (& (dst[k]));
1599  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1600 
1601  q2_fk = vld2q_s32 (p_src);
1602  q2_fnkc = vld2q_s32 (p_src2);
1603  q2_tw = vld2q_s32 (p_twiddles);
1604  q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1605  q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1606  q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1607  q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1608  q_fnkc_i = vnegq_s32 (q_fnkc_i);
1609 
1610  q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
1611  q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
1612  q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
1613  q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);
1614 
1615  q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1616  q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1617  q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1618  q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1619  q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1620  q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1621 
1622  q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1623  q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1624  q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1625  q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1626  q_dst2_r = vrev64q_s32 (q_dst2_r);
1627  q_dst2_i = vrev64q_s32 (q_dst2_i);
1628  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1629  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1630  vst2q_s32 (p_dst, q2_dst);
1631  vst2q_s32 (p_dst2, q2_dst2);
1632 
1633  }
1634 
1635  }
1636  else
1637  {
1638  for (k = 1; k <= count ; k += 4)
1639  {
1640  p_src = (int32_t*) (& (src[k]));
1641  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1642  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1643  p_dst = (int32_t*) (& (dst[k]));
1644  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1645 
1646  q2_fk = vld2q_s32 (p_src);
1647  q2_fnkc = vld2q_s32 (p_src2);
1648  q2_tw = vld2q_s32 (p_twiddles);
1649  q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1650  q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1651  q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1652  q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1653  q_fnkc_i = vnegq_s32 (q_fnkc_i);
1654 
1655  q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
1656  q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
1657  q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
1658  q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);
1659 
1660  q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1661  q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1662  q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1663  q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1664  q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1665  q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1666 
1667  q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1668  q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1669  q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1670  q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1671  q_dst2_r = vrev64q_s32 (q_dst2_r);
1672  q_dst2_i = vrev64q_s32 (q_dst2_i);
1673  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1674  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1675  vst2q_s32 (p_dst, q2_dst);
1676  vst2q_s32 (p_dst2, q2_dst2);
1677 
1678  }
1679  }
1680  }
1681  else
1682  {
1683 
1684  for (k = 1; k <= ncfft / 2; k++)
1685  {
1686  fk = src[k];
1687  fnkc.r = src[ncfft - k].r;
1688  fnkc.i = -src[ncfft - k].i;
1689  if (scaled_flag)
1690  {
1691  NE10_F2I32_FIXDIV (fk, 2);
1692  NE10_F2I32_FIXDIV (fnkc, 2);
1693  }
1694 
1695  fek.r = fk.r + fnkc.r;
1696  fek.i = fk.i + fnkc.i;
1697 
1698  tmp.r = fk.r - fnkc.r;
1699  tmp.i = fk.i - fnkc.i;
1700 
1701  fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1702  fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
1703 
1704  dst[k].r = fek.r + fok.r;
1705  dst[k].i = fek.i + fok.i;
1706 
1707  dst[ncfft - k].r = fek.r - fok.r;
1708  dst[ncfft - k].i = fok.i - fek.i;
1709  }
1710  }
1711 }
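Both split routines above walk the spectrum from opposite ends at once: vld2q_s32 loads four consecutive complex values starting at &src[ncfft - k - 3], and the vrev64q_s32 / vcombine_s32 pair then flips the lane order so that lane j holds the mirrored bin src[ncfft - (k + j)], matching src[k + j] loaded from the other end. The helper below is a stand-alone sketch of that lane-reversal idiom; the name reverse_s32x4 is mine, not part of the library.

    /* Illustrative only: reverse the four 32-bit lanes of a vector,
     * [a0 a1 a2 a3] -> [a3 a2 a1 a0].
     * vrev64q_s32 swaps lanes within each 64-bit half:    [a1 a0 a3 a2]
     * vcombine_s32 (high, low) then swaps the two halves: [a3 a2 a1 a0] */
    static inline int32x4_t reverse_s32x4 (int32x4_t v)
    {
        v = vrev64q_s32 (v);
        return vcombine_s32 (vget_high_s32 (v), vget_low_s32 (v));
    }

The same idiom is applied on the store side, so the mirrored outputs dst[ncfft - k - 3] .. dst[ncfft - k] are written back in ascending memory order.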
1712 
1713 
1718 void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
1719  ne10_fft_cpx_int32_t *fin,
1720  ne10_fft_cfg_int32_t cfg,
1721  ne10_int32_t inverse_fft,
1722  ne10_int32_t scaled_flag)
1723 {
1724  // For inputs shorter than 15 points, fall back to the plain C version.
1725  // We would not get much improvement from NEON for these small sizes.
1726  if (cfg->nfft < 15)
1727  {
1728  ne10_fft_c2c_1d_int32_c (fout, fin, cfg, inverse_fft, scaled_flag);
1729  return;
1730  }
1731 
1732  ne10_int32_t stage_count = cfg->factors[0];
1733  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1734 
1735  assert ((algorithm_flag == NE10_FFT_ALG_DEFAULT)
1736  || (algorithm_flag == NE10_FFT_ALG_ANY));
1737 
1738  // For NE10_FFT_ALG_ANY, use the generic mixed-radix butterflies.
1739  // The function returns inside this branch.
1740  if (algorithm_flag == NE10_FFT_ALG_ANY)
1741  {
1742  if (inverse_fft)
1743  {
1744  ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
1745  cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1746  }
1747  else
1748  {
1749  ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
1750  cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1751  }
1752  return;
1753  }
1754 
1755  if (scaled_flag)
1756  {
1757  if (inverse_fft)
1758  {
1759  switch (cfg->nfft)
1760  {
1761  case 4:
1762  ne10_fft4_backward_int32_scaled (fout, fin);
1763  break;
1764  case 8:
1765  ne10_fft8_backward_int32_scaled (fout, fin);
1766  break;
1767  case 16:
1768  ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
1769  break;
1770  default:
1771  ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1772  break;
1773  }
1774  }
1775  else
1776  {
1777  switch (cfg->nfft)
1778  {
1779  case 4:
1780  ne10_fft4_forward_int32_scaled (fout, fin);
1781  break;
1782  case 8:
1783  ne10_fft8_forward_int32_scaled (fout, fin);
1784  break;
1785  case 16:
1786  ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
1787  break;
1788  default:
1789  ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1790  break;
1791  }
1792  }
1793  }
1794  else
1795  {
1796  if (inverse_fft)
1797  {
1798  switch (cfg->nfft)
1799  {
1800  case 4:
1801  ne10_fft4_backward_int32_unscaled (fout, fin);
1802  break;
1803  case 8:
1804  ne10_fft8_backward_int32_unscaled (fout, fin);
1805  break;
1806  case 16:
1807  ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1808  break;
1809  default:
1810  ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1811  break;
1812  }
1813  }
1814  else
1815  {
1816  switch (cfg->nfft)
1817  {
1818  case 4:
1819  ne10_fft4_forward_int32_unscaled (fout, fin);
1820  break;
1821  case 8:
1822  ne10_fft8_forward_int32_unscaled (fout, fin);
1823  break;
1824  case 16:
1825  ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1826  break;
1827  default:
1828  ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1829  break;
1830  }
1831  }
1832  }
1833 }
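For context, a typical call sequence for the complex-to-complex entry point above looks roughly like the sketch below. It assumes the usual Ne10 configuration helpers ne10_fft_alloc_c2c_int32_neon and ne10_fft_destroy_c2c_int32 from NE10_dsp.h, which are defined elsewhere; treat the exact allocator names as assumptions rather than something shown in this file.

    /* Sketch only: forward, scaled 64-point c2c FFT on Q31 data. */
    void example_c2c_int32 (void)
    {
        ne10_fft_cpx_int32_t in[64], out[64];
        ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32_neon (64); /* assumed helper */

        if (cfg)
        {
            /* ... fill in[] with Q31 samples ... */
            ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0 /* forward */, 1 /* scaled */);
            ne10_fft_destroy_c2c_int32 (cfg); /* assumed helper */
        }
    }

Passing scaled_flag = 1 applies the stage-by-stage scaling seen in the butterflies above, which keeps the Q31 intermediates from overflowing; scaled_flag = 0 leaves the output unscaled.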
1834 
1839 void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
1840  ne10_int32_t *fin,
1841  ne10_fft_r2c_cfg_int32_t cfg,
1842  ne10_int32_t scaled_flag)
1843 {
1844  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1845  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1846  ne10_fft_state_int32_t c2c_state;
1847 
1848  c2c_state.nfft = cfg->ncfft;
1849  c2c_state.factors = cfg->factors;
1850  c2c_state.twiddles = cfg->twiddles;
1851  c2c_state.buffer = tmpbuf2;
1852 
1853  ne10_fft_c2c_1d_int32_neon (tmpbuf1, (ne10_fft_cpx_int32_t*) fin, &c2c_state, 0, scaled_flag);
1854  ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1855 }
1856 
1861 void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
1862  ne10_fft_cpx_int32_t *fin,
1863  ne10_fft_r2c_cfg_int32_t cfg,
1864  ne10_int32_t scaled_flag)
1865 {
1866  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1867  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1868  ne10_fft_state_int32_t c2c_state;
1869 
1870  c2c_state.nfft = cfg->ncfft;
1871  c2c_state.factors = cfg->factors;
1872  c2c_state.twiddles = cfg->twiddles;
1873  c2c_state.buffer = tmpbuf2;
1874 
1875  ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1876  ne10_fft_c2c_1d_int32_neon ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1877 }
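A corresponding sketch for the real-to-complex / complex-to-real pair follows. It assumes the ne10_fft_alloc_r2c_int32 and ne10_fft_destroy_r2c_int32 helpers from NE10_dsp.h (again an assumption, not shown in this file). For an N-point real transform the complex spectrum occupies N/2 + 1 bins, since bins 0 and N/2 are purely real.

    /* Sketch only: 128-point real FFT and its inverse on Q31 data. */
    void example_r2c_int32 (void)
    {
        ne10_int32_t time_in[128], time_out[128];
        ne10_fft_cpx_int32_t freq[128 / 2 + 1];
        ne10_fft_r2c_cfg_int32_t cfg = ne10_fft_alloc_r2c_int32 (128); /* assumed helper */

        if (cfg)
        {
            /* ... fill time_in[] with Q31 samples ... */
            ne10_fft_r2c_1d_int32_neon (freq, time_in, cfg, 1 /* scaled */);
            ne10_fft_c2r_1d_int32_neon (time_out, freq, cfg, 1 /* scaled */);
            ne10_fft_destroy_r2c_int32 (cfg); /* assumed helper */
        }
    }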