NE10_fft_float32.neonintrinsic.c
/*
 * Copyright 2014-16 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_float32.neonintrinsic.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"

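/*
 * Note on the small fixed-size kernels below: with inputs x0..x3, the radix-4
 * (4-point DFT) butterfly computes
 *     X0 = (x0 + x2) + (x1 + x3)
 *     X1 = (x0 - x2) - i*(x1 - x3)
 *     X2 = (x0 + x2) - (x1 + x3)
 *     X3 = (x0 - x2) + i*(x1 - x3)
 * which is exactly what ne10_fft4_forward_float32 stores. The backward
 * (inverse) variant conjugates the rotation (the signs applied to the
 * i*(x1 - x3) cross terms are swapped) and scales each output by 1/4.
 */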
static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t *out,
                                              ne10_fft_cpx_float32_t *in)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    // X[0] +/- X[2N/4]
    tmp_r = in[0].r + in[2].r;
    tmp_i = in[0].i + in[2].i;
    s2_r = in[0].r - in[2].r;
    s2_i = in[0].i - in[2].i;

    // X[N/4] +/- X[3N/4]
    s0_r = in[1].r + in[3].r;
    s0_i = in[1].i + in[3].i;
    s1_r = in[1].r - in[3].r;
    s1_i = in[1].i - in[3].i;

    // Combine the (X[0] +/- X[2N/4]) and (X[N/4] +/- X[3N/4]) components for the full
    // radix-4 butterfly, storing the results.
    out[2].r = tmp_r - s0_r;
    out[2].i = tmp_i - s0_i;
    out[0].r = tmp_r + s0_r;
    out[0].i = tmp_i + s0_i;
    out[1].r = s2_r + s1_i;
    out[1].i = s2_i - s1_r;
    out[3].r = s2_r - s1_i;
    out[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_float32 (ne10_fft_cpx_float32_t *out,
                                               ne10_fft_cpx_float32_t *in)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    // X[0] +/- X[2N/4]
    tmp_r = in[0].r + in[2].r;
    tmp_i = in[0].i + in[2].i;
    s2_r = in[0].r - in[2].r;
    s2_i = in[0].i - in[2].i;

    // X[N/4] +/- X[3N/4]
    s0_r = in[1].r + in[3].r;
    s0_i = in[1].i + in[3].i;
    s1_r = in[1].r - in[3].r;
    s1_i = in[1].i - in[3].i;

    // Combine the (X[0] +/- X[2N/4]) and (X[N/4] +/- X[3N/4]) components for the full
    // radix-4 butterfly, multiplying by (1 / nfft) and storing the results.
    out[2].r = (tmp_r - s0_r) * 0.25f;
    out[2].i = (tmp_i - s0_i) * 0.25f;
    out[0].r = (tmp_r + s0_r) * 0.25f;
    out[0].i = (tmp_i + s0_i) * 0.25f;
    out[1].r = (s2_r - s1_i) * 0.25f;
    out[1].i = (s2_i + s1_r) * 0.25f;
    out[3].r = (s2_r + s1_i) * 0.25f;
    out[3].i = (s2_i - s1_r) * 0.25f;
}

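/*
 * The radix-8 kernels below hardcode the twiddle constant
 * TW_81 = 0.70710678 ~= 1/sqrt(2) = cos(pi/4) = sin(pi/4): the magnitude of
 * the real and imaginary parts of the 8-point twiddle factors e^(-i*pi/4)
 * and e^(-i*3*pi/4) that rotate the odd-indexed terms.
 */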
static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t *out,
                                              ne10_fft_cpx_float32_t *in)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    // Prepare sums for the butterfly calculations
    s0_r = in[0].r + in[4].r;
    s0_i = in[0].i + in[4].i;
    s1_r = in[0].r - in[4].r;
    s1_i = in[0].i - in[4].i;
    s2_r = in[1].r + in[5].r;
    s2_i = in[1].i + in[5].i;
    s3_r = in[1].r - in[5].r;
    s3_i = in[1].i - in[5].i;
    s4_r = in[2].r + in[6].r;
    s4_i = in[2].i + in[6].i;
    s5_r = in[2].r - in[6].r;
    s5_i = in[2].i - in[6].i;
    s6_r = in[3].r + in[7].r;
    s6_i = in[3].i + in[7].i;
    s7_r = in[3].r - in[7].r;
    s7_i = in[3].i - in[7].i;

    // Combine the components without twiddles, storing the results.
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    out[0].r = t1_r + t2_r;
    out[0].i = t1_i + t2_i;
    out[4].r = t1_r - t2_r;
    out[4].i = t1_i - t2_i;
    out[2].r = t0_r + t3_i;
    out[2].i = t0_i - t3_r;
    out[6].r = t0_r - t3_i;
    out[6].i = t0_i + t3_r;

    // Combine the components with twiddles (using hardcoded radix-8 twiddle values),
    // storing the results.
    t4_r = (s3_r + s3_i) * TW_81;
    t4_i = - (s3_r - s3_i) * TW_81;
    t5_r = (s7_r - s7_i) * TW_81;
    t5_i = (s7_r + s7_i) * TW_81;
    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    out[1].r = t1_r + t2_r;
    out[1].i = t1_i + t2_i;
    out[5].r = t1_r - t2_r;
    out[5].i = t1_i - t2_i;
    out[3].r = t0_r + t3_i;
    out[3].i = t0_i - t3_r;
    out[7].r = t0_r - t3_i;
    out[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t *out,
                                               ne10_fft_cpx_float32_t *in)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    // Prepare sums for the butterfly calculations
    s0_r = in[0].r + in[4].r;
    s0_i = in[0].i + in[4].i;
    s1_r = in[0].r - in[4].r;
    s1_i = in[0].i - in[4].i;
    s2_r = in[1].r + in[5].r;
    s2_i = in[1].i + in[5].i;
    s3_r = in[1].r - in[5].r;
    s3_i = in[1].i - in[5].i;
    s4_r = in[2].r + in[6].r;
    s4_i = in[2].i + in[6].i;
    s5_r = in[2].r - in[6].r;
    s5_i = in[2].i - in[6].i;
    s6_r = in[3].r + in[7].r;
    s6_i = in[3].i + in[7].i;
    s7_r = in[3].r - in[7].r;
    s7_i = in[3].i - in[7].i;

    // Combine the components without twiddles, multiplying by (1 / nfft) and storing the
    // results.
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    out[0].r = (t1_r + t2_r) * 0.125f;
    out[0].i = (t1_i + t2_i) * 0.125f;
    out[4].r = (t1_r - t2_r) * 0.125f;
    out[4].i = (t1_i - t2_i) * 0.125f;
    out[2].r = (t0_r - t3_i) * 0.125f;
    out[2].i = (t0_i + t3_r) * 0.125f;
    out[6].r = (t0_r + t3_i) * 0.125f;
    out[6].i = (t0_i - t3_r) * 0.125f;

    // Combine the components with twiddles (using hardcoded radix-8 twiddle values),
    // multiplying by (1 / nfft) and storing the results.
    t4_r = (s3_r - s3_i) * TW_81;
    t4_i = (s3_r + s3_i) * TW_81;
    t5_r = (s7_r + s7_i) * TW_81;
    t5_i = - (s7_r - s7_i) * TW_81;
    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    out[1].r = (t1_r + t2_r) * 0.125f;
    out[1].i = (t1_i + t2_i) * 0.125f;
    out[5].r = (t1_r - t2_r) * 0.125f;
    out[5].i = (t1_i - t2_i) * 0.125f;
    out[3].r = (t0_r - t3_i) * 0.125f;
    out[3].i = (t0_i + t3_r) * 0.125f;
    out[7].r = (t0_r + t3_i) * 0.125f;
    out[7].i = (t0_i - t3_r) * 0.125f;
}

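/*
 * The 16-point kernels below proceed in two radix-4 stages. The first stage
 * computes four radix-4 butterflies at once, one per vector lane; the
 * vzipq/vcombine sequence then transposes the intermediate results so that
 * the second (twiddled) stage can also operate on whole vectors.
 */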
static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                             ne10_fft_cpx_float32_t * Fin,
                                             ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // The first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);

    // The second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}

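/*
 * The backward 16-point kernel mirrors the forward one, conjugating the
 * twiddle multiplications (note the swapped vmlaq/vmlsq usage) and scaling
 * the outputs by 1/16 (the 0.0625f loaded into q_one_by_nfft).
 */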
static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin,
                                              ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // The first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);

    // The second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    float32x4_t q_one_by_nfft;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
    q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q_one_by_nfft = vdupq_n_f32 (0.0625f);
    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);

    q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
    q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
    q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
    q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
    q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
    q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
    q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
    q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}

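/*
 * The kernels that follow are the generic stage building blocks used by the
 * mixed radix-4/8 drivers at the bottom of this file. Each one processes
 * "stride" (or "mstride") butterflies, four per loop iteration, using
 * VLD2Q/VST2Q to deinterleave the real and imaginary parts into separate
 * vectors.
 */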
static inline void ne10_radix8x4_neon (ne10_fft_cpx_float32_t *out,
                                       ne10_fft_cpx_float32_t *in,
                                       ne10_int32_t stride)
{
    ne10_int32_t f_count;
    ne10_int32_t src_step = stride << 1; // ne10_fft_cpx_float32_t -> float32_t offset
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;
    const ne10_float32_t TW_81 = 0.70710678;
    const ne10_float32_t TW_81N = -0.70710678;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
    float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
    float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
    float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
    float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
    float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
    float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
    float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
    float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
    float32x4_t q_tw_81, q_tw_81n;

    // This loop is unrolled four times, taking 16 NEON quadword registers of input to
    // process four radix-8 butterflies per loop iteration (in contrast to the single
    // radix-8 butterfly per iteration in the pure C counterpart).
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        // Load the input values. Each q2_in* is a pair of two NEON quadword registers,
        // each member of which can hold two complex values (i.e. four float32 values).
        // Though we load four complex values at a time into the pairs here using VLD2Q,
        // the layout of the data ensures that we will not need to perform arithmetic
        // between values in the same vector.
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in4 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in6 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in5 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in7 = vld2q_f32 (p_src);
        p_src += src_step;

        // Calculate sums for the butterfly calculations between the <X[0], X[4N/8]>,
        // <X[N/8], X[5N/8]>, <X[2N/8], X[6N/8]>, and <X[3N/8], X[7N/8]> components.
        q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
        q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
        q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
        q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
        q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
        q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
        q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
        q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
        q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
        q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
        q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
        q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
        q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
        q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
        q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
        q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);

        // Multiply some of these by hardcoded radix-8 twiddles
        q_tw_81 = vdupq_n_f32 (TW_81);
        q_tw_81n = vdupq_n_f32 (TW_81N);
        q_s5_r = q_sin5_i;
        q_s5_i = vnegq_f32 (q_sin5_r);
        q_s3_r = vaddq_f32 (q_sin3_r, q_sin3_i);
        q_s3_i = vsubq_f32 (q_sin3_i, q_sin3_r);
        q_s7_r = vsubq_f32 (q_sin7_r, q_sin7_i);
        q_s7_i = vaddq_f32 (q_sin7_i, q_sin7_r);
        q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
        q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
        q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
        q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);

        // Combine the first set of pairs of these sums
        q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
        q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
        q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
        q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
        q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
        q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
        q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
        q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);

        // Combine the second set of pairs of these sums
        q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
        q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
        q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
        q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
        q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
        q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
        q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
        q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);

        // Combine these combined components (for the full radix-8 butterfly)
        q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
        q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
        q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
        q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
        q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
        q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
        q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
        q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
        q_out2_r = vaddq_f32 (q_s10_r, q_s14_i);
        q_out2_i = vsubq_f32 (q_s10_i, q_s14_r);
        q_out3_r = vaddq_f32 (q_s11_r, q_s15_i);
        q_out3_i = vsubq_f32 (q_s11_i, q_s15_r);
        q_out6_r = vsubq_f32 (q_s10_r, q_s14_i);
        q_out6_i = vaddq_f32 (q_s10_i, q_s14_r);
        q_out7_r = vsubq_f32 (q_s11_r, q_s15_i);
        q_out7_i = vaddq_f32 (q_s11_i, q_s15_r);

        // Use VTRNQ and VCOMBINE to permute the vectors for contiguous stores. This is
        // necessary in this first stage as we wish to store the related calculated values
        // contiguously (as in the rolled pure C loop) even though they are not contiguous
        // in the vectors (instead, values that are from separate loop iterations in the C
        // version of the code are currently contiguous in vectors).
        q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
        q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
        q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
        q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
        q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
        q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
        q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
        q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
        q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
        q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
        q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
        q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
        q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
        q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
        q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
        q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
        q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
        q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
        q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
        q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
        q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
        q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
        q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
        q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out4);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out5);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out6);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out7);
        p_dst += 8;

        // Undo the p_src arithmetic from earlier in the loop, and add eight to skip past
        // the float32 values that have been used within this loop iteration.
        p_src = p_src - src_step * 8 + 8;
    } // f_count
}

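/*
 * A note on the vtrnq/vcombine permutations used in these kernels:
 * vtrnq_f32([a0, a1, a2, a3], [b0, b1, b2, b3]) yields the pair
 * {[a0, b0, a2, b2], [a1, b1, a3, b3]}, so a vtrnq step followed by
 * vcombine of the matching low/high halves amounts to a 4x4 transpose.
 * Afterwards each output vector holds four consecutive outputs of a single
 * butterfly, ready for a contiguous store.
 */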
static inline void ne10_radix4x4_without_twiddles_neon (ne10_fft_cpx_float32_t *out,
                                                        ne10_fft_cpx_float32_t *in,
                                                        ne10_int32_t stride)
{
    ne10_int32_t f_count;
    ne10_int32_t src_step = stride << 1; // ne10_fft_cpx_float32_t -> float32_t offset
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
    float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
    float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

    // This loop is unrolled four times, taking 8 NEON quadword registers of input to
    // process four radix-4 butterflies per loop iteration.
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;

        // Calculate sums for the butterfly calculations between the <X[0], X[2N/4]> and
        // <X[N/4], X[3N/4]> components.
        q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
        q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);

        // Combine these sums (for the full radix-4 butterfly)
        q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
        q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
        q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
        q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
        q_out1_r = vaddq_f32 (q_s1_r, q_s3_i);
        q_out1_i = vsubq_f32 (q_s1_i, q_s3_r);
        q_out3_r = vsubq_f32 (q_s1_r, q_s3_i);
        q_out3_i = vaddq_f32 (q_s1_i, q_s3_r);

        // Permute the vectors for contiguous stores
        q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
        q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
        q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
        q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
        q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
        q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
        q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
        q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
        q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
        q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
        q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
        q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += 8;

        // Adjust p_src for the next loop iteration
        p_src = p_src - src_step * 4 + 8;
    } // f_count
}

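/*
 * In the twiddled kernels below, the VMUL/VMLS/VMLA sequences implement the
 * complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i: VMUL forms the
 * a*c and b*c terms, and VMLS/VMLA then fold in the -b*d and a*d terms.
 */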
static inline void ne10_radix4x4_with_twiddles_neon (ne10_fft_cpx_float32_t *out,
                                                     ne10_fft_cpx_float32_t *in,
                                                     ne10_fft_cpx_float32_t *tw,
                                                     ne10_int32_t src_stride,
                                                     ne10_int32_t dst_stride,
                                                     ne10_int32_t mstride)
{
    ne10_int32_t m_count;
    ne10_int32_t src_step = src_stride << 1; // ne10_fft_cpx_float32_t -> float32_t offsets
    ne10_int32_t dst_step = dst_stride << 1;
    ne10_int32_t tw_step = mstride << 1;
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;
    float32_t *p_tw = (float32_t *) tw;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
    float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
    float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
    float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

    // This loop is unrolled four times, taking 8 NEON quadword registers of input to
    // process four radix-4 butterflies per loop iteration.
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;

        // Load the twiddles
        q2_tw0 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw1 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw2 = vld2q_f32 (p_tw);

        // Multiply input elements by their associated twiddles
        q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
        q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
        q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
        q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
        q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
        q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
        q_s1_r = vmlsq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
        q_s1_i = vmlaq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
        q_s2_r = vmlsq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
        q_s2_i = vmlaq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
        q_s3_r = vmlsq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
        q_s3_i = vmlaq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

        // Calculate sums for the butterfly calculations between the <X[0], X[2N/4]> and
        // <X[N/4], X[3N/4]> components.
        q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
        q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
        q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
        q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
        q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
        q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
        q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
        q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

        // Combine these sums (for the full radix-4 butterfly)
        q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
        q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
        q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
        q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
        q2_out1.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vaddq_f32 (q_s5_i, q_s7_r);

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += dst_step;

        // Undo the arithmetic we did to these variables earlier in the loop, and add
        // eight to skip past the float32 values processed within this loop iteration.
        p_src = p_src - src_step * 4 + 8;
        p_dst = p_dst - dst_step * 4 + 8;
        p_tw = p_tw - tw_step * 2 + 8;
    } // m_count
}

static inline void ne10_radix8x4_inverse_neon (ne10_fft_cpx_float32_t *out,
                                               ne10_fft_cpx_float32_t *in,
                                               ne10_int32_t stride)
{
    ne10_int32_t f_count;
    ne10_int32_t src_step = stride << 1;
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;
    const ne10_float32_t TW_81 = 0.70710678;
    const ne10_float32_t TW_81N = -0.70710678;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
    float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
    float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
    float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
    float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
    float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
    float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
    float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
    float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
    float32x4_t q_tw_81, q_tw_81n;

    // This loop is unrolled four times, taking 16 NEON quadword registers of input to
    // process four radix-8 butterflies per loop iteration.
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in4 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in6 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in5 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in7 = vld2q_f32 (p_src);
        p_src += src_step;

        // Calculate sums for the butterfly calculations between the <X[0], X[4N/8]>,
        // <X[N/8], X[5N/8]>, <X[2N/8], X[6N/8]>, and <X[3N/8], X[7N/8]> components.
        q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
        q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
        q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
        q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
        q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
        q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
        q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
        q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
        q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
        q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
        q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
        q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
        q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
        q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
        q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
        q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);

        // Multiply some of these by hardcoded radix-8 twiddles
        q_tw_81 = vdupq_n_f32 (TW_81);
        q_tw_81n = vdupq_n_f32 (TW_81N);
        q_s5_r = vnegq_f32 (q_sin5_i);
        q_s5_i = q_sin5_r;
        q_s3_r = vsubq_f32 (q_sin3_r, q_sin3_i);
        q_s3_i = vaddq_f32 (q_sin3_i, q_sin3_r);
        q_s7_r = vaddq_f32 (q_sin7_r, q_sin7_i);
        q_s7_i = vsubq_f32 (q_sin7_i, q_sin7_r);
        q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
        q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
        q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
        q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);

        // Combine the first set of pairs of these sums
        q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
        q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
        q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
        q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
        q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
        q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
        q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
        q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);

        // Combine the second set of pairs of these sums
        q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
        q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
        q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
        q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
        q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
        q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
        q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
        q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);

        // Combine these combined components (for the full radix-8 butterfly)
        q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
        q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
        q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
        q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
        q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
        q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
        q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
        q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
        q_out2_r = vsubq_f32 (q_s10_r, q_s14_i);
        q_out2_i = vaddq_f32 (q_s10_i, q_s14_r);
        q_out3_r = vsubq_f32 (q_s11_r, q_s15_i);
        q_out3_i = vaddq_f32 (q_s11_i, q_s15_r);
        q_out6_r = vaddq_f32 (q_s10_r, q_s14_i);
        q_out6_i = vsubq_f32 (q_s10_i, q_s14_r);
        q_out7_r = vaddq_f32 (q_s11_r, q_s15_i);
        q_out7_i = vsubq_f32 (q_s11_i, q_s15_r);

        // Permute the vectors for contiguous stores
        q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
        q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
        q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
        q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
        q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
        q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
        q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
        q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
        q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
        q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
        q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
        q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
        q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
        q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
        q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
        q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
        q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
        q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
        q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
        q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
        q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
        q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
        q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
        q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out4);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out5);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out6);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out7);
        p_dst += 8;

        // Adjust p_src for the next loop iteration
        p_src = p_src - src_step * 8 + 8;
    } // f_count
}

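/*
 * The inverse kernels mirror their forward counterparts with the rotations
 * conjugated: factors of -i become +i, the hardcoded radix-8 twiddles swap
 * roles, and twiddle products take the conjugate form (ac + bd) + (bc - ad)i.
 * The 1/nfft scaling of the inverse transform is deferred to the last stage.
 */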
static inline void ne10_radix4x4_inverse_without_twiddles_neon (ne10_fft_cpx_float32_t *out,
                                                                ne10_fft_cpx_float32_t *in,
                                                                ne10_int32_t stride)
{
    ne10_int32_t f_count;
    ne10_int32_t src_step = stride << 1;
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
    float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
    float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

    // This loop is unrolled four times, taking 8 NEON quadword registers of input to
    // process four radix-4 butterflies per loop iteration.
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;

        // Calculate sums for the butterfly calculations between the <X[0], X[2N/4]> and
        // <X[N/4], X[3N/4]> components.
        q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
        q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);

        // Combine these sums (for the full radix-4 butterfly)
        q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
        q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
        q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
        q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
        q_out1_r = vsubq_f32 (q_s1_r, q_s3_i);
        q_out1_i = vaddq_f32 (q_s1_i, q_s3_r);
        q_out3_r = vaddq_f32 (q_s1_r, q_s3_i);
        q_out3_i = vsubq_f32 (q_s1_i, q_s3_r);

        // Permute the vectors for contiguous stores
        q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
        q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
        q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
        q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
        q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
        q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
        q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
        q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
        q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
        q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
        q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
        q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += 8;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += 8;

        // Adjust p_src for the next loop iteration
        p_src = p_src - src_step * 4 + 8;
    } // f_count
}

static inline void ne10_radix4x4_inverse_with_twiddles_neon (ne10_fft_cpx_float32_t *out,
                                                             ne10_fft_cpx_float32_t *in,
                                                             ne10_fft_cpx_float32_t *tw,
                                                             ne10_int32_t src_stride,
                                                             ne10_int32_t dst_stride,
                                                             ne10_int32_t mstride)
{
    ne10_int32_t m_count;
    ne10_int32_t src_step = src_stride << 1;
    ne10_int32_t dst_step = dst_stride << 1;
    ne10_int32_t tw_step = mstride << 1;
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;
    float32_t *p_tw = (float32_t *) tw;

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
    float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
    float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
    float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

    // This loop is unrolled four times, taking 8 NEON quadword registers of input to
    // process four radix-4 butterflies per loop iteration.
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;

        // Load the twiddles
        q2_tw0 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw1 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw2 = vld2q_f32 (p_tw);

        // Multiply input elements by their associated twiddles
        q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
        q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
        q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
        q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
        q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
        q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
        q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
        q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
        q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
        q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
        q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
        q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

        // Calculate sums for the butterfly calculations between the <X[0], X[2N/4]> and
        // <X[N/4], X[3N/4]> components.
        q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
        q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
        q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
        q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
        q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
        q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
        q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
        q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

        // Combine these sums (for the full radix-4 butterfly)
        q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
        q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
        q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
        q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
        q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += dst_step;

        // Adjust p_src, p_dst, p_tw for the next loop iteration
        p_src = p_src - src_step * 4 + 8;
        p_dst = p_dst - dst_step * 4 + 8;
        p_tw = p_tw - tw_step * 2 + 8;
    } // m_count
}

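/*
 * This last-stage variant is identical to the kernel above except that it
 * also multiplies every output by 1/nfft, folding the inverse transform's
 * normalization into the final pass over the data.
 */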
static inline void ne10_radix4x4_inverse_with_twiddles_last_stage_neon (ne10_fft_cpx_float32_t *out,
                                                                        ne10_fft_cpx_float32_t *in,
                                                                        ne10_fft_cpx_float32_t *tw,
                                                                        ne10_int32_t src_stride,
                                                                        ne10_int32_t dst_stride,
                                                                        ne10_int32_t mstride,
                                                                        ne10_int32_t nfft)
{
    ne10_int32_t m_count;
    ne10_int32_t src_step = src_stride << 1;
    ne10_int32_t dst_step = dst_stride << 1;
    ne10_int32_t tw_step = mstride << 1;
    float32_t *p_src = (float32_t *) in;
    float32_t *p_dst = (float32_t *) out;
    float32_t *p_tw = (float32_t *) tw;
    ne10_float32_t one_by_nfft = (1.0f / (ne10_float32_t) nfft);

    float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
    float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
    float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
    float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
    float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
    float32x4_t q_one_by_nfft = vdupq_n_f32 (one_by_nfft);

    // This loop is unrolled four times, taking 8 NEON quadword registers of input to
    // process four radix-4 butterflies per loop iteration.
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        // Load the input values
        q2_in0 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in1 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in2 = vld2q_f32 (p_src);
        p_src += src_step;
        q2_in3 = vld2q_f32 (p_src);
        p_src += src_step;

        // Load the twiddles
        q2_tw0 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw1 = vld2q_f32 (p_tw);
        p_tw += tw_step;
        q2_tw2 = vld2q_f32 (p_tw);

        // Multiply input elements by their associated twiddles
        q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
        q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
        q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
        q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
        q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
        q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
        q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
        q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
        q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
        q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
        q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
        q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

        // Calculate sums for the butterfly calculations between the <X[0], X[2N/4]> and
        // <X[N/4], X[3N/4]> components.
        q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
        q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
        q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
        q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
        q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
        q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
        q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
        q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

        // Combine these sums (for the full radix-4 butterfly)
        q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
        q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
        q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
        q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
        q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);

        // Multiply by (1 / nfft)
        q2_out0.val[0] = vmulq_f32 (q2_out0.val[0], q_one_by_nfft);
        q2_out0.val[1] = vmulq_f32 (q2_out0.val[1], q_one_by_nfft);
        q2_out1.val[0] = vmulq_f32 (q2_out1.val[0], q_one_by_nfft);
        q2_out1.val[1] = vmulq_f32 (q2_out1.val[1], q_one_by_nfft);
        q2_out2.val[0] = vmulq_f32 (q2_out2.val[0], q_one_by_nfft);
        q2_out2.val[1] = vmulq_f32 (q2_out2.val[1], q_one_by_nfft);
        q2_out3.val[0] = vmulq_f32 (q2_out3.val[0], q_one_by_nfft);
        q2_out3.val[1] = vmulq_f32 (q2_out3.val[1], q_one_by_nfft);

        // Store the results
        vst2q_f32 (p_dst, q2_out0);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out1);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out2);
        p_dst += dst_step;
        vst2q_f32 (p_dst, q2_out3);
        p_dst += dst_step;

        // Adjust p_src, p_dst, p_tw for the next loop iteration
        p_src = p_src - src_step * 4 + 8;
        p_dst = p_dst - dst_step * 4 + 8;
        p_tw = p_tw - tw_step * 2 + 8;
    } // m_count
}

1270 /*
1271  * This function calculates the FFT for power-of-two input sizes using an ordered, mixed
1272  * radix-4/8 DIT algorithm. It is very similar to its C counterpart in NE10_fft_float32.c,
1273  * "ne10_mixed_radix_butterfly_float32_c".
1274  */
1277  ne10_int32_t *factors,
1278  ne10_fft_cpx_float32_t *twiddles,
1279  ne10_fft_cpx_float32_t *buffer)
1280 {
1281  ne10_int32_t stage_count = factors[0];
1282  ne10_int32_t fstride = factors[1];
1283  ne10_int32_t mstride = factors[(stage_count << 1) - 1];
1284  ne10_int32_t first_radix = factors[stage_count << 1];
1285  ne10_int32_t step, f_count;
1286  ne10_fft_cpx_float32_t *src = in;
1287  ne10_fft_cpx_float32_t *dst = out;
1288  ne10_fft_cpx_float32_t *out_final = out;
1289  ne10_fft_cpx_float32_t *tw, *tmp;
1290 
1291  // The first stage (using hardcoded twiddles)
1292  if (first_radix == 8) // For our factoring, this means nfft is of form 2^{odd}
1293  {
1294  ne10_radix8x4_neon (dst, src, fstride);
1295 
1296  // Update variables for the next stages
1297  step = fstride << 1; // For C2C, 1/4 of input size (fstride is nfft/8)
1298  stage_count--;
1299  fstride /= 4;
1300  }
1301  else if (first_radix == 4) // For our factoring, this means nfft is of form 2^{even}
1302  {
1303  ne10_radix4x4_without_twiddles_neon (dst, src, fstride);
1304 
1305  // Update variables for the next stages
1306  step = fstride; // For C2C, 1/4 of input size (fstride is nfft/4)
1307  stage_count--;
1308  fstride /= 4;
1309  }
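 // (For example, nfft = 128 = 8 * 4 * 4 takes the radix-8 branch above, while
 //  nfft = 64 = 4 * 4 * 4 takes the radix-4 branch.)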
1310 
1311  // The next stage should read the output of the first stage as input
1312  in = out;
1313  out = buffer;
1314 
1315  // Middle stages (after the first, excluding the last)
1316  for (; stage_count > 1 ; stage_count--)
1317  {
1318  src = in;
1319  for (f_count = 0; f_count < fstride; f_count ++)
1320  {
1321  dst = &out[f_count * (mstride * 4)];
1322  tw = twiddles; // Reset the twiddle pointer for the next section
1323  ne10_radix4x4_with_twiddles_neon (dst, src, tw, step, mstride, mstride);
1324  src += mstride;
1325  } // f_count
1326 
1327  // Update variables for the next stages
1328  twiddles += mstride * 3;
1329  mstride *= 4;
1330  fstride /= 4;
1331 
1332  // Swap the input and output buffers for the next stage
1333  tmp = in;
1334  in = out;
1335  out = tmp;
1336  } // stage_count
1337 
1338  // The last stage
1339  if (stage_count)
1340  {
1341  src = in;
1342 
1343  // Always write to the final output buffer (if necessary, we can calculate this
1344  // in-place as the final stage reads and writes at the same offsets)
1345  dst = out_final;
1346 
1347  for (f_count = 0; f_count < fstride; f_count++)
1348  {
1349  tw = twiddles; // Reset the twiddle pointer for the next section
1350  ne10_radix4x4_with_twiddles_neon (dst, src, tw, step, step, mstride);
1351  src += mstride;
1352  dst += mstride;
1353  } // f_count
1354  } // last stage
1355 }
1356 
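/*
 * A worked example of the stage bookkeeping above (illustrative; only the
 * factors[] entries actually read by this function are shown). For
 * nfft = 64 = 4 * 4 * 4:
 *
 *   stage_count = 3, fstride = 16, first_radix = 4, initial mstride = 4
 *
 *   First stage : radix-4 without twiddles; then step = 16, fstride = 4
 *   Middle stage: fstride = 4 sections of radix-4 with mstride = 4;
 *                 then mstride = 16, fstride = 1
 *   Last stage  : a single radix-4 section with mstride = 16, writing all 64
 *                 results to out_final while intermediates ping-pong through
 *                 "buffer"
 */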
1357 /*
1358  * This function calculates the inverse FFT, and is very similar in structure to its
1359  * complement "ne10_mixed_radix_fft_forward_float32_neon".
1360  */
1361 void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t *out,
1362  ne10_fft_cpx_float32_t *in,
1363  ne10_int32_t *factors,
1364  ne10_fft_cpx_float32_t *twiddles,
1365  ne10_fft_cpx_float32_t *buffer)
1366 {
1367  ne10_int32_t stage_count = factors[0];
1368  ne10_int32_t fstride = factors[1];
1369  ne10_int32_t mstride = factors[(stage_count << 1) - 1];
1370  ne10_int32_t first_radix = factors[stage_count << 1];
1371  ne10_float32_t nfft = fstride * first_radix;
1372  ne10_int32_t step, f_count;
1373  ne10_fft_cpx_float32_t *src = in;
1374  ne10_fft_cpx_float32_t *dst = out;
1375  ne10_fft_cpx_float32_t *out_final = out;
1376  ne10_fft_cpx_float32_t *tw, *tmp;
1377 
1378  // The first stage (using hardcoded twiddles)
1379  if (first_radix == 8) // nfft is of form 2^{odd}
1380  {
1381  ne10_radix8x4_inverse_neon (dst, src, fstride);
1382 
1383  // Update variables for the next stages
1384  step = fstride << 1;
1385  stage_count--;
1386  fstride /= 4;
1387  }
1388  else if (first_radix == 4) // nfft is of form 2^{even}
1389  {
1390  ne10_radix4x4_inverse_without_twiddles_neon (dst, src, fstride);
1391 
1392  // Update variables for the next stages
1393  step = fstride;
1394  stage_count--;
1395  fstride /= 4;
1396  }
1397 
1398  // The next stage should read the output of the first stage as input
1399  in = out;
1400  out = buffer;
1401 
1402  // Middle stages (after the first, excluding the last)
1403  for (; stage_count > 1; stage_count--)
1404  {
1405  src = in;
1406  for (f_count = 0; f_count < fstride; f_count++)
1407  {
1408  dst = &out[f_count * (mstride * 4)];
1409  tw = twiddles;
1410  ne10_radix4x4_inverse_with_twiddles_neon (dst, src, tw, step, mstride, mstride);
1411  src += mstride;
1412  } // f_count
1413 
1414  // Update variables for the next stages
1415  twiddles += mstride * 3;
1416  mstride *= 4;
1417  fstride /= 4;
1418 
1419  // Swap the input and output buffers for the next stage
1420  tmp = in;
1421  in = out;
1422  out = tmp;
1423  } // stage_count
1424 
1425  // The last stage
1426  if (stage_count)
1427  {
1428  src = in;
1429 
1430  // Always write to the final output buffer (if necessary, we can calculate this
1431  // in-place as the final stage reads and writes at the same offsets)
1432  dst = out_final;
1433 
1434  for (f_count = 0; f_count < fstride; f_count++)
1435  {
1436  tw = twiddles;
1437  ne10_radix4x4_inverse_with_twiddles_last_stage_neon (dst, src, tw, step, step, mstride, nfft);
1438  src += mstride;
1439  dst += mstride;
1440  } // f_count
1441  } // last stage
1442 }
1443 
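/*
 * Net effect of the backward path: the normalised inverse DFT
 *
 *     x[n] = (1 / N) * sum_{k=0}^{N-1} X[k] * e^(+2*pi*j*k*n / N),
 *
 * where the conjugated twiddles supply the e^(+...) kernel and the single
 * 1/N scaling is deferred to the last stage.
 */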
1444 /**
1445  * @ingroup C2C_FFT_IFFT
1446  * @brief Specific implementation of ne10_fft_c2c_1d_float32 using NEON SIMD
1447  * capabilities.
1448  */
1449 
1450 void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
1451  ne10_fft_cpx_float32_t *fin,
1452  ne10_fft_cfg_float32_t cfg,
1453  ne10_int32_t inverse_fft)
1454 {
1455  // For inputs shorter than 15 samples, fall back to the C version, as
1456  // we would not get much improvement from NEON for these small cases.
1457  if (cfg->nfft < 15)
1458  {
1459  ne10_fft_c2c_1d_float32_c (fout, fin, cfg, inverse_fft);
1460  return;
1461  }
1462 
1463  ne10_int32_t stage_count = cfg->factors[0];
1464  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1465 
1466  assert ((algorithm_flag == NE10_FFT_ALG_DEFAULT)
1467  || (algorithm_flag == NE10_FFT_ALG_ANY));
1468 
1469  // For NE10_FFT_ALG_ANY, the generic butterfly is used, and the
1470  // function will return inside this branch.
1471  if (algorithm_flag == NE10_FFT_ALG_ANY)
1472  {
1473  if (inverse_fft)
1474  {
1475  ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin,
1476  cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_backward_scaled);
1477  }
1478  else
1479  {
1480  ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin,
1481  cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_forward_scaled);
1482  }
1483  return;
1484  }
1485 
1486  // Since the function has passed the assertion and skipped the branch above,
1487  // algorithm_flag must be NE10_FFT_ALG_DEFAULT.
1488  if (inverse_fft)
1489  {
1490  switch (cfg->nfft)
1491  {
1492  case 4:
1493  ne10_fft4_backward_float32 (fout, fin);
1494  break;
1495  case 8:
1496  ne10_fft8_backward_float32 (fout, fin);
1497  break;
1498  case 16:
1499  ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);
1500  break;
1501  default:
1502  ne10_mixed_radix_fft_backward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1503  break;
1504  }
1505  }
1506  else
1507  {
1508  switch (cfg->nfft)
1509  {
1510  case 4:
1511  ne10_fft4_forward_float32 (fout, fin);
1512  break;
1513  case 8:
1514  ne10_fft8_forward_float32 (fout, fin);
1515  break;
1516  case 16:
1517  ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
1518  break;
1519  default:
1520  ne10_mixed_radix_fft_forward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1521  break;
1522  }
1523  }
1524 }
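/*
 * A minimal usage sketch for the dispatch routine above. It assumes the
 * allocation/teardown helpers ne10_fft_alloc_c2c_float32_neon and
 * ne10_fft_destroy_c2c_float32 from NE10_dsp.h, and omits error handling.
 */
static void example_c2c_roundtrip (void)
{
    const ne10_int32_t nfft = 1024;
    ne10_fft_cpx_float32_t src[1024];
    ne10_fft_cpx_float32_t dst[1024];
    ne10_int32_t i;

    for (i = 0; i < nfft; i++)
    {
        src[i].r = (ne10_float32_t) i; // arbitrary test signal
        src[i].i = 0.0f;
    }

    // Build the factor/twiddle state once; it can be reused for many transforms.
    ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32_neon (nfft);

    ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0); // forward
    ne10_fft_c2c_1d_float32_neon (src, dst, cfg, 1); // inverse: src is restored
                                                     // (up to rounding) thanks
                                                     // to the 1/nfft scaling

    ne10_fft_destroy_c2c_float32 (cfg);
}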