tmp_r = in[0].r + in[2].r;
tmp_i = in[0].i + in[2].i;
s2_r = in[0].r - in[2].r;
s2_i = in[0].i - in[2].i;

s0_r = in[1].r + in[3].r;
s0_i = in[1].i + in[3].i;
s1_r = in[1].r - in[3].r;
s1_i = in[1].i - in[3].i;

out[2].r = tmp_r - s0_r;
out[2].i = tmp_i - s0_i;
out[0].r = tmp_r + s0_r;
out[0].i = tmp_i + s0_i;
out[1].r = s2_r + s1_i;
out[1].i = s2_i - s1_r;
out[3].r = s2_r - s1_i;
out[3].i = s2_i + s1_r;
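For reference, the statements above are the textbook radix-4 decomposition of a forward 4-point DFT (this is a reading of the code, not a comment taken from the source). With tmp = x0 + x2, s2 = x0 - x2, s0 = x1 + x3 and s1 = x1 - x3:

\[ X_0 = (x_0 + x_2) + (x_1 + x_3), \quad X_2 = (x_0 + x_2) - (x_1 + x_3), \quad X_{1,3} = (x_0 - x_2) \mp j\,(x_1 - x_3), \]

so out[1].r = s2_r + s1_i and out[1].i = s2_i - s1_r are exactly the real and imaginary parts of s2 - j*s1, and out[3] is s2 + j*s1.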
tmp_r = in[0].r + in[2].r;
tmp_i = in[0].i + in[2].i;
s2_r = in[0].r - in[2].r;
s2_i = in[0].i - in[2].i;

s0_r = in[1].r + in[3].r;
s0_i = in[1].i + in[3].i;
s1_r = in[1].r - in[3].r;
s1_i = in[1].i - in[3].i;

out[2].r = (tmp_r - s0_r) * 0.25f;
out[2].i = (tmp_i - s0_i) * 0.25f;
out[0].r = (tmp_r + s0_r) * 0.25f;
out[0].i = (tmp_i + s0_i) * 0.25f;
out[1].r = (s2_r - s1_i) * 0.25f;
out[1].i = (s2_i + s1_r) * 0.25f;
out[3].r = (s2_r + s1_i) * 0.25f;
out[3].i = (s2_i - s1_r) * 0.25f;
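The backward variant differs only in the sign of the j rotation on bins 1 and 3 and in folding the 1/N normalisation into every output, hence the 0.25f factors. For N = 4 this computes (again inferred from the code rather than stated in it)

\[ x_n = \frac{1}{4} \sum_{k=0}^{3} X_k \, e^{+j 2\pi k n / 4}. \]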
ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;

s0_r = in[0].r + in[4].r;
s0_i = in[0].i + in[4].i;
s1_r = in[0].r - in[4].r;
s1_i = in[0].i - in[4].i;
s2_r = in[1].r + in[5].r;
s2_i = in[1].i + in[5].i;
s3_r = in[1].r - in[5].r;
s3_i = in[1].i - in[5].i;
s4_r = in[2].r + in[6].r;
s4_i = in[2].i + in[6].i;
s5_r = in[2].r - in[6].r;
s5_i = in[2].i - in[6].i;
s6_r = in[3].r + in[7].r;
s6_i = in[3].i + in[7].i;
s7_r = in[3].r - in[7].r;
s7_i = in[3].i - in[7].i;

/* some lines are omitted from this excerpt here; t0 .. t3 get their values in the full source */
out[0].r = t1_r + t2_r;
out[0].i = t1_i + t2_i;
out[4].r = t1_r - t2_r;
out[4].i = t1_i - t2_i;
out[2].r = t0_r + t3_i;
out[2].i = t0_i - t3_r;
out[6].r = t0_r - t3_i;
out[6].i = t0_i + t3_r;

t4_r = (s3_r + s3_i) * TW_81;
t4_i = - (s3_r - s3_i) * TW_81;
t5_r = (s7_r - s7_i) * TW_81;
t5_i = (s7_r + s7_i) * TW_81;

/* further omitted lines here; the full source refreshes t0 .. t3 before the remaining outputs */
out[1].r = t1_r + t2_r;
out[1].i = t1_i + t2_i;
out[5].r = t1_r - t2_r;
out[5].i = t1_i - t2_i;
out[3].r = t0_r + t3_i;
out[3].i = t0_i - t3_r;
out[7].r = t0_r - t3_i;
out[7].i = t0_i + t3_r;
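TW_81 is presumably the radix-8 twiddle constant cos(pi/4) = sqrt(2)/2, approximately 0.70710678f, with TW_81N its negation (the names follow the source; the values are an assumption). Under that assumption the t4/t5 lines apply the +/-45-degree twiddles of the radix-8 butterfly, e.g.

\[ s_3 \, e^{-j\pi/4} = \frac{\sqrt{2}}{2}\big[(s_{3r} + s_{3i}) + j\,(s_{3i} - s_{3r})\big], \]

which is exactly t4_r and t4_i above; the backward 8-point block that follows uses the same constant with the signs mirrored.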
ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;

s0_r = in[0].r + in[4].r;
s0_i = in[0].i + in[4].i;
s1_r = in[0].r - in[4].r;
s1_i = in[0].i - in[4].i;
s2_r = in[1].r + in[5].r;
s2_i = in[1].i + in[5].i;
s3_r = in[1].r - in[5].r;
s3_i = in[1].i - in[5].i;
s4_r = in[2].r + in[6].r;
s4_i = in[2].i + in[6].i;
s5_r = in[2].r - in[6].r;
s5_i = in[2].i - in[6].i;
s6_r = in[3].r + in[7].r;
s6_i = in[3].i + in[7].i;
s7_r = in[3].r - in[7].r;
s7_i = in[3].i - in[7].i;

/* some lines are omitted from this excerpt here; t0 .. t3 get their values in the full source */
out[0].r = (t1_r + t2_r) * 0.125f;
out[0].i = (t1_i + t2_i) * 0.125f;
out[4].r = (t1_r - t2_r) * 0.125f;
out[4].i = (t1_i - t2_i) * 0.125f;
out[2].r = (t0_r - t3_i) * 0.125f;
out[2].i = (t0_i + t3_r) * 0.125f;
out[6].r = (t0_r + t3_i) * 0.125f;
out[6].i = (t0_i - t3_r) * 0.125f;

t4_r = (s3_r - s3_i) * TW_81;
t4_i = (s3_r + s3_i) * TW_81;
t5_r = (s7_r + s7_i) * TW_81;
t5_i = - (s7_r - s7_i) * TW_81;

/* further omitted lines here; the full source refreshes t0 .. t3 before the remaining outputs */
out[1].r = (t1_r + t2_r) * 0.125f;
out[1].i = (t1_i + t2_i) * 0.125f;
out[5].r = (t1_r - t2_r) * 0.125f;
out[5].i = (t1_i - t2_i) * 0.125f;
out[3].r = (t0_r - t3_i) * 0.125f;
out[3].i = (t0_i + t3_r) * 0.125f;
out[7].r = (t0_r + t3_i) * 0.125f;
out[7].i = (t0_i - t3_r) * 0.125f;
float32_t *p_src0, *p_src4, *p_src8, *p_src12;
float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
p_src0 = (float32_t*) (& (Fin[0]));
p_src4 = (float32_t*) (& (Fin[4]));
p_src8 = (float32_t*) (& (Fin[8]));
p_src12 = (float32_t*) (& (Fin[12]));
q2_in_0123 = vld2q_f32 (p_src0);
q2_in_4567 = vld2q_f32 (p_src4);
q2_in_89ab = vld2q_f32 (p_src8);
q2_in_cdef = vld2q_f32 (p_src12);

q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);

float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
float32_t *p_tw1, *p_tw2, *p_tw3;
float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;

p_dst0 = (float32_t*) (&Fout[0]);
p_dst1 = (float32_t*) (&Fout[4]);
p_dst2 = (float32_t*) (&Fout[8]);
p_dst3 = (float32_t*) (&Fout[12]);
p_tw1 = (float32_t*) tw1;
p_tw2 = (float32_t*) tw2;
p_tw3 = (float32_t*) tw3;
q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
q2_tw1 = vld2q_f32 (p_tw1);
q2_tw2 = vld2q_f32 (p_tw2);
q2_tw3 = vld2q_f32 (p_tw3);

q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);

q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);

vst2q_f32 (p_dst0, q2_out_0123);
vst2q_f32 (p_dst1, q2_out_4567);
vst2q_f32 (p_dst2, q2_out_89ab);
vst2q_f32 (p_dst3, q2_out_cdef);
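Each vmulq/vmlsq/vmlaq group in the block above is a vectorised complex multiply: per lane it computes s = in * tw with the usual forward-FFT sign convention. A scalar sketch of the same arithmetic (illustrative only; the helper is not part of the Ne10 API):

/* One lane of the q_s0 computation above:
 *   s_r = in_r * tw_r - in_i * tw_i   (vmulq, then vmlsq)
 *   s_i = in_r * tw_i + in_i * tw_r   (vmulq, then vmlaq)  */
static inline void cpx_mul (float in_r, float in_i,
                            float tw_r, float tw_i,
                            float *s_r, float *s_i)
{
    *s_r = in_r * tw_r - in_i * tw_i;
    *s_i = in_r * tw_i + in_i * tw_r;
}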
float32_t *p_src0, *p_src4, *p_src8, *p_src12;
float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
p_src0 = (float32_t*) (& (Fin[0]));
p_src4 = (float32_t*) (& (Fin[4]));
p_src8 = (float32_t*) (& (Fin[8]));
p_src12 = (float32_t*) (& (Fin[12]));
q2_in_0123 = vld2q_f32 (p_src0);
q2_in_4567 = vld2q_f32 (p_src4);
q2_in_89ab = vld2q_f32 (p_src8);
q2_in_cdef = vld2q_f32 (p_src12);

q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);

float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
float32_t *p_tw1, *p_tw2, *p_tw3;
float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
float32x4_t q_one_by_nfft;

p_dst0 = (float32_t*) (&Fout[0]);
p_dst1 = (float32_t*) (&Fout[4]);
p_dst2 = (float32_t*) (&Fout[8]);
p_dst3 = (float32_t*) (&Fout[12]);
p_tw1 = (float32_t*) tw1;
p_tw2 = (float32_t*) tw2;
p_tw3 = (float32_t*) tw3;
q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
q2_tw1 = vld2q_f32 (p_tw1);
q2_tw2 = vld2q_f32 (p_tw2);
q2_tw3 = vld2q_f32 (p_tw3);

q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);

q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

q_one_by_nfft = vdupq_n_f32 (0.0625f);
q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);

q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);

vst2q_f32 (p_dst0, q2_out_0123);
vst2q_f32 (p_dst1, q2_out_4567);
vst2q_f32 (p_dst2, q2_out_89ab);
vst2q_f32 (p_dst3, q2_out_cdef);
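The inverse kernel swaps the accumulate pattern (vmlaq on the real part, vmlsq on the imaginary part), which per lane is a multiply by the conjugated twiddle, and the 0.0625f constant is 1/16, i.e. the 1/nfft normalisation for this 16-point kernel. A scalar sketch under the same reading (the helper name is illustrative, not Ne10 API):

/* s = in * conj(tw):
 *   s_r = in_r * tw_r + in_i * tw_i
 *   s_i = in_i * tw_r - in_r * tw_i   */
static inline void cpx_mul_conj (float in_r, float in_i,
                                 float tw_r, float tw_i,
                                 float *s_r, float *s_i)
{
    *s_r = in_r * tw_r + in_i * tw_i;
    *s_i = in_i * tw_r - in_r * tw_i;
}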
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
float32x4_t q_tw_81, q_tw_81n;

for (f_count = 0; f_count < stride; f_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in4 = vld2q_f32 (p_src);
q2_in6 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);
q2_in5 = vld2q_f32 (p_src);
q2_in7 = vld2q_f32 (p_src);

q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);

q_tw_81 = vdupq_n_f32 (TW_81);
q_tw_81n = vdupq_n_f32 (TW_81N);
q_s5_i = vnegq_f32 (q_sin5_r);
q_s3_r = vaddq_f32 (q_sin3_r, q_sin3_i);
q_s3_i = vsubq_f32 (q_sin3_i, q_sin3_r);
q_s7_r = vsubq_f32 (q_sin7_r, q_sin7_i);
q_s7_i = vaddq_f32 (q_sin7_i, q_sin7_r);
q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);

q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);

q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);

q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
q_out2_r = vaddq_f32 (q_s10_r, q_s14_i);
q_out2_i = vsubq_f32 (q_s10_i, q_s14_r);
q_out3_r = vaddq_f32 (q_s11_r, q_s15_i);
q_out3_i = vsubq_f32 (q_s11_i, q_s15_r);
q_out6_r = vsubq_f32 (q_s10_r, q_s14_i);
q_out6_i = vaddq_f32 (q_s10_i, q_s14_r);
q_out7_r = vsubq_f32 (q_s11_r, q_s15_i);
q_out7_i = vaddq_f32 (q_s11_i, q_s15_r);

q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);
vst2q_f32 (p_dst, q2_out4);
vst2q_f32 (p_dst, q2_out5);
vst2q_f32 (p_dst, q2_out6);
vst2q_f32 (p_dst, q2_out7);

p_src = p_src - src_step * 8 + 8;
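vld2q_f32 de-interleaves as it loads: val[0] receives four consecutive real parts and val[1] the matching imaginary parts, which is what lets the butterfly work on whole vectors of .r and .i at once. Note that this excerpt (and the inverse variant further down) omits some lines of the original between the loads and stores, such as the pointer advances and the companion assignment to q_s5_r, which is why the loop tail rewinds with p_src = p_src - src_step * 8 + 8 and why a few variables appear to be used before they are set. A minimal standalone illustration of the de-interleaving (the values are made up):

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
    /* interleaved complex data: {r0,i0, r1,i1, r2,i2, r3,i3} */
    float buf[8] = {0.f, 10.f, 1.f, 11.f, 2.f, 12.f, 3.f, 13.f};
    float32x4x2_t v = vld2q_f32 (buf);

    float re[4], im[4];
    vst1q_f32 (re, v.val[0]);   /* re = {0, 1, 2, 3}     */
    vst1q_f32 (im, v.val[1]);   /* im = {10, 11, 12, 13} */

    printf ("%g %g %g %g | %g %g %g %g\n",
            re[0], re[1], re[2], re[3], im[0], im[1], im[2], im[3]);
    return 0;
}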
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

for (f_count = 0; f_count < stride; f_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);

q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);

q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
q_out1_r = vaddq_f32 (q_s1_r, q_s3_i);
q_out1_i = vsubq_f32 (q_s1_i, q_s3_r);
q_out3_r = vsubq_f32 (q_s1_r, q_s3_i);
q_out3_i = vaddq_f32 (q_s1_i, q_s3_r);

q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);

p_src = p_src - src_step * 4 + 8;
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;
float32_t *p_tw = (float32_t *) tw;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

for (m_count = 0; m_count < mstride; m_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);

q2_tw0 = vld2q_f32 (p_tw);
q2_tw1 = vld2q_f32 (p_tw);
q2_tw2 = vld2q_f32 (p_tw);

q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
q_s1_r = vmlsq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
q_s1_i = vmlaq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
q_s2_r = vmlsq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
q_s2_i = vmlaq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
q_s3_r = vmlsq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
q_s3_i = vmlaq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
q2_out1.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
q2_out1.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
q2_out3.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
q2_out3.val[1] = vaddq_f32 (q_s5_i, q_s7_r);

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);

p_src = p_src - src_step * 4 + 8;
p_dst = p_dst - dst_step * 4 + 8;
p_tw = p_tw - tw_step * 2 + 8;
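The loop above is the general radix-4 stage: inputs 1-3 are rotated by the per-stage twiddles and then combined exactly as in the small scalar butterflies earlier in the file. An illustrative scalar version of one element of the loop body (the real code does four of these per iteration, one per vector lane; the cpx type and function are sketch-only, not Ne10 API):

typedef struct { float r, i; } cpx;

static void radix4_butterfly_fwd (cpx out[4], const cpx in[4], const cpx tw[3])
{
    cpx s1, s2, s3, s4, s5, s6, s7;

    /* twiddle the three non-trivial inputs: s = in * tw (forward convention) */
    s1.r = in[1].r * tw[0].r - in[1].i * tw[0].i;
    s1.i = in[1].i * tw[0].r + in[1].r * tw[0].i;
    s2.r = in[2].r * tw[1].r - in[2].i * tw[1].i;
    s2.i = in[2].i * tw[1].r + in[2].r * tw[1].i;
    s3.r = in[3].r * tw[2].r - in[3].i * tw[2].i;
    s3.i = in[3].i * tw[2].r + in[3].r * tw[2].i;

    /* radix-4 combine */
    s4.r = in[0].r + s2.r;  s4.i = in[0].i + s2.i;
    s5.r = in[0].r - s2.r;  s5.i = in[0].i - s2.i;
    s6.r = s1.r + s3.r;     s6.i = s1.i + s3.i;
    s7.r = s1.r - s3.r;     s7.i = s1.i - s3.i;

    out[0].r = s4.r + s6.r; out[0].i = s4.i + s6.i;
    out[2].r = s4.r - s6.r; out[2].i = s4.i - s6.i;
    out[1].r = s5.r + s7.i; out[1].i = s5.i - s7.r;   /* s5 - j*s7 */
    out[3].r = s5.r - s7.i; out[3].i = s5.i + s7.r;   /* s5 + j*s7 */
}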
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
float32x4_t q_tw_81, q_tw_81n;

for (f_count = 0; f_count < stride; f_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in4 = vld2q_f32 (p_src);
q2_in6 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);
q2_in5 = vld2q_f32 (p_src);
q2_in7 = vld2q_f32 (p_src);

q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);

q_tw_81 = vdupq_n_f32 (TW_81);
q_tw_81n = vdupq_n_f32 (TW_81N);
q_s5_r = vnegq_f32 (q_sin5_i);
q_s3_r = vsubq_f32 (q_sin3_r, q_sin3_i);
q_s3_i = vaddq_f32 (q_sin3_i, q_sin3_r);
q_s7_r = vaddq_f32 (q_sin7_r, q_sin7_i);
q_s7_i = vsubq_f32 (q_sin7_i, q_sin7_r);
q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);

q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);

q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);

q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
q_out2_r = vsubq_f32 (q_s10_r, q_s14_i);
q_out2_i = vaddq_f32 (q_s10_i, q_s14_r);
q_out3_r = vsubq_f32 (q_s11_r, q_s15_i);
q_out3_i = vaddq_f32 (q_s11_i, q_s15_r);
q_out6_r = vaddq_f32 (q_s10_r, q_s14_i);
q_out6_i = vsubq_f32 (q_s10_i, q_s14_r);
q_out7_r = vaddq_f32 (q_s11_r, q_s15_i);
q_out7_i = vsubq_f32 (q_s11_i, q_s15_r);

q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);
vst2q_f32 (p_dst, q2_out4);
vst2q_f32 (p_dst, q2_out5);
vst2q_f32 (p_dst, q2_out6);
vst2q_f32 (p_dst, q2_out7);

p_src = p_src - src_step * 8 + 8;
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

for (f_count = 0; f_count < stride; f_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);

q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);

q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
q_out1_r = vsubq_f32 (q_s1_r, q_s3_i);
q_out1_i = vaddq_f32 (q_s1_i, q_s3_r);
q_out3_r = vaddq_f32 (q_s1_r, q_s3_i);
q_out3_i = vsubq_f32 (q_s1_i, q_s3_r);

q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);

p_src = p_src - src_step * 4 + 8;
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;
float32_t *p_tw = (float32_t *) tw;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;

for (m_count = 0; m_count < mstride; m_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);

q2_tw0 = vld2q_f32 (p_tw);
q2_tw1 = vld2q_f32 (p_tw);
q2_tw2 = vld2q_f32 (p_tw);

q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);

p_src = p_src - src_step * 4 + 8;
p_dst = p_dst - dst_step * 4 + 8;
p_tw = p_tw - tw_step * 2 + 8;
float32_t *p_src = (float32_t *) in;
float32_t *p_dst = (float32_t *) out;
float32_t *p_tw = (float32_t *) tw;

float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
float32x4_t q_one_by_nfft = vdupq_n_f32 (one_by_nfft);

for (m_count = 0; m_count < mstride; m_count += 4)

q2_in0 = vld2q_f32 (p_src);
q2_in1 = vld2q_f32 (p_src);
q2_in2 = vld2q_f32 (p_src);
q2_in3 = vld2q_f32 (p_src);

q2_tw0 = vld2q_f32 (p_tw);
q2_tw1 = vld2q_f32 (p_tw);
q2_tw2 = vld2q_f32 (p_tw);

q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);

q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);

q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);

q2_out0.val[0] = vmulq_f32 (q2_out0.val[0], q_one_by_nfft);
q2_out0.val[1] = vmulq_f32 (q2_out0.val[1], q_one_by_nfft);
q2_out1.val[0] = vmulq_f32 (q2_out1.val[0], q_one_by_nfft);
q2_out1.val[1] = vmulq_f32 (q2_out1.val[1], q_one_by_nfft);
q2_out2.val[0] = vmulq_f32 (q2_out2.val[0], q_one_by_nfft);
q2_out2.val[1] = vmulq_f32 (q2_out2.val[1], q_one_by_nfft);
q2_out3.val[0] = vmulq_f32 (q2_out3.val[0], q_one_by_nfft);
q2_out3.val[1] = vmulq_f32 (q2_out3.val[1], q_one_by_nfft);

vst2q_f32 (p_dst, q2_out0);
vst2q_f32 (p_dst, q2_out1);
vst2q_f32 (p_dst, q2_out2);
vst2q_f32 (p_dst, q2_out3);

p_src = p_src - src_step * 4 + 8;
p_dst = p_dst - dst_step * 4 + 8;
p_tw = p_tw - tw_step * 2 + 8;
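This last-stage inverse variant is the only one that takes the overall transform length into account: q_one_by_nfft is splatted from one_by_nfft (presumably 1.0f / nfft) and multiplied into every output vector, so the inverse-DFT normalisation

\[ x_n = \frac{1}{N} \sum_{k=0}^{N-1} X_k \, e^{+j 2\pi k n / N} \]

is applied exactly once, at the final stage, rather than being spread across every stage.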
ne10_int32_t mstride = factors[(stage_count << 1) - 1];

if (first_radix == 8)
ne10_radix8x4_neon (dst, src, fstride);
step = fstride << 1;

else if (first_radix == 4)
ne10_radix4x4_without_twiddles_neon (dst, src, fstride);

for (; stage_count > 1 ; stage_count--)
for (f_count = 0; f_count < fstride; f_count ++)
dst = &out[f_count * (mstride * 4)];
ne10_radix4x4_with_twiddles_neon (dst, src, tw, step, mstride, mstride);
twiddles += mstride * 3;

for (f_count = 0; f_count < fstride; f_count++)
ne10_radix4x4_with_twiddles_neon (dst, src, tw, step, step, mstride);
ne10_int32_t mstride = factors[(stage_count << 1) - 1];

if (first_radix == 8)
ne10_radix8x4_inverse_neon (dst, src, fstride);
step = fstride << 1;

else if (first_radix == 4)
ne10_radix4x4_inverse_without_twiddles_neon (dst, src, fstride);

for (; stage_count > 1; stage_count--)
for (f_count = 0; f_count < fstride; f_count++)
dst = &out[f_count * (mstride * 4)];
ne10_radix4x4_inverse_with_twiddles_neon (dst, src, tw, step, mstride, mstride);
twiddles += mstride * 3;

for (f_count = 0; f_count < fstride; f_count++)
ne10_radix4x4_inverse_with_twiddles_last_stage_neon (dst, src, tw, step, step, mstride, nfft);
ne10_fft4_backward_float32 (fout, fin);
ne10_fft8_backward_float32 (fout, fin);
ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);

ne10_fft4_forward_float32 (fout, fin);
ne10_fft8_forward_float32 (fout, fin);
ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
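These call sites sit inside a size dispatch that hands 4-, 8- and 16-point transforms to the dedicated kernels above and sends larger sizes through the mixed-radix NEON path. A hedged sketch of the forward branch; the switch variable cfg->nfft and the default case are assumptions, only the three calls are taken from the excerpt:

switch (cfg->nfft)              /* field name assumed */
{
    case 4:
        ne10_fft4_forward_float32 (fout, fin);
        break;
    case 8:
        ne10_fft8_forward_float32 (fout, fin);
        break;
    case 16:
        ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
        break;
    default:
        /* larger sizes presumably use the mixed-radix NEON kernels above */
        break;
}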