const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

for (f_count = fstride; f_count > 0; f_count--)

Fin_neon = Fin_neon - fstride * 8 + 1;
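/*
 * Note: when NE10_DSP_RFFT_SCALING is defined, the #ifdef blocks in the
 * fragments below multiply every output vector by one_by_N with vmulq_f32,
 * presumably the 1/nfft normalisation of the inverse (c2r) path.
 */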
const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

for (f_count = fstride; f_count > 0; f_count--)

#ifdef NE10_DSP_RFFT_SCALING
    q_out0 = vmulq_f32 (q_out0, one_by_N_neon);
    q_out1 = vmulq_f32 (q_out1, one_by_N_neon);
    q_out2 = vmulq_f32 (q_out2, one_by_N_neon);
    q_out3 = vmulq_f32 (q_out3, one_by_N_neon);
    q_out4 = vmulq_f32 (q_out4, one_by_N_neon);
    q_out5 = vmulq_f32 (q_out5, one_by_N_neon);
    q_out6 = vmulq_f32 (q_out6, one_by_N_neon);
    q_out7 = vmulq_f32 (q_out7, one_by_N_neon);
const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

for (f_count = 0; f_count < fstride; f_count++)

Fin_neon = Fin_neon - 4*fstride + 1;
const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

for (f_count = 0; f_count < fstride; f_count++)

#ifdef NE10_DSP_RFFT_SCALING
    q_out0 = vmulq_f32 (q_out0, one_by_N_neon);
    q_out1 = vmulq_f32 (q_out1, one_by_N_neon);
    q_out2 = vmulq_f32 (q_out2, one_by_N_neon);
    q_out3 = vmulq_f32 (q_out3, one_by_N_neon);
const float32x4_t *Fin_neon,

vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) - 1), q_out1);
vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q_out2);
vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
const float32x4_t *Fin_neon,

q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) - 1) );
q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)    ) );
q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );
const float32x4_t *Fin_neon,

float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2;

for (m_count = loop_count; m_count > 0; m_count--)

#ifndef NE10_INLINE_ASM_OPT
    q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step    ) );
    q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );

    q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step    ) );
    q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );

    q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step    ) );
    q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );

    q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step    ) );
    q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );

    q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
    q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

    q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
    q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

    q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
    q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
    asm volatile (
        "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
        "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
        "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
        "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
        "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
        "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"

        "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t"
        "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t"
        "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t"
        "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t"

        "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t"
        "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t"
        "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t"
        "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t"

        "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t"
        "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t"
        "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t"
        "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t"
        : [q2_out0r] "+w" (q2_out0.val[0]),
          [q2_out0i] "+w" (q2_out0.val[1]),
          [q2_out1r] "+w" (q2_out1.val[0]),
          [q2_out1i] "+w" (q2_out1.val[1]),
          [q2_out2r] "+w" (q2_out2.val[0]),
          [q2_out2i] "+w" (q2_out2.val[1]),
          [q2_out3r] "+w" (q2_out3.val[0]),
          [q2_out3i] "+w" (q2_out3.val[1]),
          [ptr_inr] "+r" (ptr_inr),
          [ptr_ini] "+r" (ptr_ini)
        : [offset_in] "r" (in_step * 16),
          [ptr_tw] "r" (twiddles)
        : "memory", "v0", "v1", "v2",
          "v10", "v11", "v12", "v13", "v14", "v15"
    );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

    vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1)    ), q2_out1.val[0] );
    vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );

    vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1)    ), q2_out3.val[0] );
    vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );
const float32x4_t *Fin_neon,

const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2;

for (m_count = loop_count; m_count > 0; m_count--)

    q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1)    ) );
    q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );

    q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1)    ) );
    q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );

    q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
    q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

    q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
    q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

    q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
    q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

    vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step    ), q2_out0.val[0] );
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );

    vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step    ), q2_out1.val[0] );
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );

    vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step    ), q2_out2.val[0] );
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );

    vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step    ), q2_out3.val[0] );
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );
const float32x4_t *Fin_neon,

vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q_out2);
vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q_out3);
const float32x4_t *Fin_neon,

q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)    ) );
q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) + 1) );
const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

for (f_count = fstride; f_count; f_count--)

Fin_neon  += 2 * ( (out_step >> 1) - 1);
Fout_neon += 2 * ( (out_step >> 1) - 1);

Fout_neon = Fout_neon + 3 * out_step;
const float32x4_t *Fin_neon = (float32x4_t*) Fin;
float32x4_t *Fout_neon = (float32x4_t*) Fout;

for (f_count = fstride; f_count; f_count--)

Fin_neon  += 2 * ( (out_step >> 1) - 1);
Fout_neon += 2 * ( (out_step >> 1) - 1);

Fin_neon = Fin_neon + 3 * out_step;
stage_count = factors[0];
fstride = factors[1];
mstride = factors[ (stage_count << 1) - 1 ];
radix = factors[ stage_count << 1 ];
nfft = radix * fstride;

if (stage_count % 2 == 1)

twiddles += 3 * mstride;
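/*
 * Three twiddle factors (q2_tw0..q2_tw2, i.e. twiddles[0..2]) are consumed per
 * radix-4 butterfly column, which is why the forward pass advances the twiddle
 * pointer by 3 * mstride per stage.
 */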
stage_count = factors[0];
fstride = factors[1];

mstride = factors[ (stage_count << 1) - 1 ];
radix = factors[ stage_count << 1 ];
nfft = radix * fstride;

if (stage_count % 2 == 0)

for (; stage_count > 1;)

twiddles -= 3 * mstride;
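/*
 * The inverse (c2r) butterfly walks the stages in reverse, so inside its stage
 * loop the twiddle pointer is stepped back by 3 * mstride, mirroring the
 * += 3 * mstride of the forward pass above.
 */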
dst[0].r = q_4r_out[0];
dst[0].i = q_4r_out[3];

dst[0].r = q_4r_out[1];
dst[0].i = q_4r_out[2];

dst[0].r = q_4r_out[0];
dst[0].i = q_4r_out[1];

dst[0].r = q_4r_out[2];
dst[0].i = q_4r_out[3];

cc_out[0].r = *(p_src_r++);
cc_out[1].r = *(p_src_r++);
cc_out[2].r = *(p_src_r++);
cc_out[3].r = *(p_src_r++);

cc_out[0].i = *(p_src_r++);
cc_out[1].i = *(p_src_r++);
cc_out[2].i = *(p_src_r++);
cc_out[3].i = *(p_src_r++);

cc_in[0] = cc_out[0];
q_4r_in[0] = src[0].r;
q_4r_in[3] = src[0].i;

q_4r_in[1] = src[0].r;
q_4r_in[2] = src[0].i;

q_4r_in[0] = src[0].r;
q_4r_in[1] = src[0].i;

q_4r_in[2] = src[0].r;
q_4r_in[3] = src[0].i;

cc_out[0] = cc_in[0];
*(p_dst_r++) = cc_out[0].r;
*(p_dst_r++) = cc_out[1].r;
*(p_dst_r++) = cc_out[2].r;
*(p_dst_r++) = cc_out[3].r;

*(p_dst_r++) = cc_out[0].i;
*(p_dst_r++) = cc_out[1].i;
*(p_dst_r++) = cc_out[2].i;
*(p_dst_r++) = cc_out[3].i;
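/*
 * The scalar first/last-stage helpers use a packed group layout: four real
 * parts followed by the four matching imaginary parts are read from p_src_r
 * and written back through p_dst_r in the same order.
 */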
q_in0[0] = *(fin_r++);
q_in0[1] = *(fin_r++);
q_in0[2] = *(fin_r++);
q_in0[3] = *(fin_r++);
q_in1[0] = *(fin_r++);
q_in1[1] = *(fin_r++);
q_in1[2] = *(fin_r++);
q_in1[3] = *(fin_r++);
q_in2[0] = *(fin_r++);
q_in2[1] = *(fin_r++);
q_in2[2] = *(fin_r++);
q_in2[3] = *(fin_r++);
q_in3[0] = *(fin_r++);
q_in3[1] = *(fin_r++);
q_in3[2] = *(fin_r++);
q_in3[3] = *(fin_r++);
q2_tw0[0][0] = tw[0];
q2_tw0[0][1] = tw[2];
q2_tw0[0][2] = tw[4];
q2_tw0[0][3] = tw[6];
q2_tw0[1][0] = tw[1];
q2_tw0[1][1] = tw[3];
q2_tw0[1][2] = tw[5];
q2_tw0[1][3] = tw[7];

q2_tw1[0][0] = tw[0+8];
q2_tw1[0][1] = tw[2+8];
q2_tw1[0][2] = tw[4+8];
q2_tw1[0][3] = tw[6+8];
q2_tw1[1][0] = tw[1+8];
q2_tw1[1][1] = tw[3+8];
q2_tw1[1][2] = tw[5+8];
q2_tw1[1][3] = tw[7+8];
q_out0[0] = q_in0[0];
q_out1[0] = q_in1[0];
q_out2[0] = q_in2[0];
q_out3[0] = q_in3[0];
q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];

q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];

q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];

q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];

q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];

q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];
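/*
 * The twiddle multiply above is the plain complex product, with q_in0/q_in2
 * holding real parts and q_in1/q_in3 imaginary parts:
 *   out_re = in_re * tw_re - in_im * tw_im;
 *   out_im = in_re * tw_im + in_im * tw_re;
 * The lane [0] terms are copied untouched above, presumably because the first
 * twiddle factor is unity.
 */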
q_in0[0] = q_out0[0] + q_out0[2];
q_in1[0] = q_out1[0] + q_out1[2];

q_in0[1] = q_out0[0] - q_out0[2];
q_in1[1] = q_out1[0] - q_out1[2];

q_in0[2] = q_out0[1] + q_out0[3];
q_in1[2] = q_out1[1] + q_out1[3];

q_in0[3] = q_out0[1] - q_out0[3];
q_in1[3] = q_out1[1] - q_out1[3];

q_in2[0] = q_out2[0] + q_out2[2];
q_in3[0] = q_out3[0] + q_out3[2];

q_in2[1] = q_out2[0] - q_out2[2];
q_in3[1] = q_out3[0] - q_out3[2];

q_in2[2] = q_out2[1] + q_out2[3];
q_in3[2] = q_out3[1] + q_out3[3];

q_in2[3] = q_out2[1] - q_out2[3];
q_in3[3] = q_out3[1] - q_out3[3];
q_out0[0] = q_in0[0] + q_in0[2];
q_out0[1] = q_in1[0] + q_in1[2];

q_out2[2] = q_in0[0] - q_in0[2];
q_out2[3] = - q_in1[0] + q_in1[2];

q_out3[2] = q_in0[1] - q_in1[3];
q_out3[3] = - q_in1[1] - q_in0[3];

q_out1[0] = q_in0[1] + q_in1[3];
q_out1[1] = q_in1[1] - q_in0[3];

q_out0[2] = q_in2[0] + q_in2[2];
q_out0[3] = q_in3[0] + q_in3[2];

q_out2[0] = q_in2[0] - q_in2[2];
q_out2[1] = - q_in3[0] + q_in3[2];

q_out3[0] = q_in2[1] - q_in3[3];
q_out3[1] = - q_in3[1] - q_in2[3];

q_out1[2] = q_in2[1] + q_in3[3];
q_out1[3] = q_in3[1] - q_in2[3];
fout_r[0] = q_out0[0];
fout_r[1] = q_out0[1];
fout_r[2] = q_out0[2];
fout_r[3] = q_out0[3];

fout_r += (nfft>>1);
fout_r[0] = q_out1[0];
fout_r[1] = q_out1[1];
fout_r[2] = q_out1[2];
fout_r[3] = q_out1[3];

fout_r[0] = q_out3[0];
fout_r[1] = q_out3[1];
fout_r[2] = q_out3[2];
fout_r[3] = q_out3[3];

fout_r += (nfft>>1);
fout_r[0] = q_out2[0];
fout_r[1] = q_out2[1];
fout_r[2] = q_out2[2];
fout_r[3] = q_out2[3];
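/* The fout_r += (nfft >> 1) steps place successive blocks of four results half
   a transform length apart in the packed output buffer. */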
q_in1[4], q_out1[4],
q_in2[4], q_out2[4],
q_in3[4], q_out3[4];
q_in0[0] = fin_r[0];
q_in0[1] = fin_r[1];
q_in0[2] = fin_r[2];
q_in0[3] = fin_r[3];

q_in1[0] = fin_r[0];
q_in1[1] = fin_r[1];
q_in1[2] = fin_r[2];
q_in1[3] = fin_r[3];

q_in3[0] = fin_r[0];
q_in3[1] = fin_r[1];
q_in3[2] = fin_r[2];
q_in3[3] = fin_r[3];

q_in2[0] = fin_r[0];
q_in2[1] = fin_r[1];
q_in2[2] = fin_r[2];
q_in2[3] = fin_r[3];

#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {           \
    q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] );          \
    q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] );          \

#undef NE10_INV_BUTTERFLY_TMP

#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {           \
    q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] );         \
    q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] );         \

#undef NE10_INV_BUTTERFLY_TMP

q2_tw0[0][0] = tw[0];
q2_tw0[0][1] = tw[2];
q2_tw0[0][2] = tw[4];
q2_tw0[0][3] = tw[6];
q2_tw0[1][0] = tw[1];
q2_tw0[1][1] = tw[3];
q2_tw0[1][2] = tw[5];
q2_tw0[1][3] = tw[7];

q2_tw1[0][0] = tw[0+8];
q2_tw1[0][1] = tw[2+8];
q2_tw1[0][2] = tw[4+8];
q2_tw1[0][3] = tw[6+8];
q2_tw1[1][0] = tw[1+8];
q2_tw1[1][1] = tw[3+8];
q2_tw1[1][2] = tw[5+8];
q2_tw1[1][3] = tw[7+8];
q_out0[0] = q_in0[0];
q_out1[0] = q_in1[0];
q_out2[0] = q_in2[0];
q_out3[0] = q_in3[0];
q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];

q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];

q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];

q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];

q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];

q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];
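/*
 * Inverse direction: out_im is formed as in_re * tw_im - in_im * tw_re here and
 * the q_out1/q_out3 imaginary lanes are negated when stored below, so the net
 * effect is multiplication by the conjugated twiddle factor.
 */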
*(fout_r++) = q_out0[0];
*(fout_r++) = q_out0[1];
*(fout_r++) = q_out0[2];
*(fout_r++) = q_out0[3];
*(fout_r++) = q_out1[0];
*(fout_r++) = - q_out1[1];
*(fout_r++) = - q_out1[2];
*(fout_r++) = - q_out1[3];
*(fout_r++) = q_out2[0];
*(fout_r++) = q_out2[1];
*(fout_r++) = q_out2[2];
*(fout_r++) = q_out2[3];
*(fout_r++) = q_out3[0];
*(fout_r++) = - q_out3[1];
*(fout_r++) = - q_out3[2];
*(fout_r++) = - q_out3[3];
for (; loop_count > 0; loop_count--)

#ifndef NE10_INLINE_ASM_OPT
    q2_out0.val[0] = vld1q_f32 (fin_r);
    q2_out0.val[1] = vld1q_f32 (fin_r);
    q2_out1.val[0] = vld1q_f32 (fin_r);
    q2_out1.val[1] = vld1q_f32 (fin_r);
    q2_out2.val[0] = vld1q_f32 (fin_r);
    q2_out2.val[1] = vld1q_f32 (fin_r);
    q2_out3.val[0] = vld1q_f32 (fin_r);
    q2_out3.val[1] = vld1q_f32 (fin_r);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
    asm volatile (
        "ld1 {v0.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v4.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v1.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v5.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v2.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v6.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v3.4s}, [%[fin_r]], 16 \n\t"
        "ld1 {v7.4s}, [%[fin_r]], 16 \n\t"

        "trn1 v8.4s, v0.4s, v1.4s \n\t"
        "trn2 v9.4s, v0.4s, v1.4s \n\t"
        "trn1 v10.4s, v2.4s, v3.4s \n\t"
        "trn2 v11.4s, v2.4s, v3.4s \n\t"

        "trn1 %[q2_in0r].2d, v8.2d, v10.2d \n\t"
        "trn1 %[q2_in1r].2d, v9.2d, v11.2d \n\t"
        "trn2 %[q2_in2r].2d, v8.2d, v10.2d \n\t"
        "trn2 %[q2_in3r].2d, v9.2d, v11.2d \n\t"

        "trn1 v8.4s, v4.4s, v5.4s \n\t"
        "trn2 v9.4s, v4.4s, v5.4s \n\t"
        "trn1 v10.4s, v6.4s, v7.4s \n\t"
        "trn2 v11.4s, v6.4s, v7.4s \n\t"

        "trn1 %[q2_in0i].2d, v8.2d, v10.2d \n\t"
        "trn1 %[q2_in1i].2d, v9.2d, v11.2d \n\t"
        "trn2 %[q2_in2i].2d, v8.2d, v10.2d \n\t"
        "trn2 %[q2_in3i].2d, v9.2d, v11.2d \n\t"

        : [q2_in0r] "+w" (q2_in0.val[0]),
          [q2_in0i] "+w" (q2_in0.val[1]),
          [q2_in1r] "+w" (q2_in1.val[0]),
          [q2_in1i] "+w" (q2_in1.val[1]),
          [q2_in2r] "+w" (q2_in2.val[0]),
          [q2_in2i] "+w" (q2_in2.val[1]),
          [q2_in3r] "+w" (q2_in3.val[0]),
          [q2_in3i] "+w" (q2_in3.val[1]),
        :
        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v8", "v9", "v10", "v11"
    );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

#ifndef NE10_INLINE_ASM_OPT
    q2_tw0 = vld2q_f32 (tw);
    q2_tw1 = vld2q_f32 (tw);
    q2_tw2 = vld2q_f32 (tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
    asm volatile (
        "ld2 {v0.4s, v1.4s}, [%[tw0]] \n\t"
        "ld2 {v2.4s, v3.4s}, [%[tw1]] \n\t"
        "ld2 {v4.4s, v5.4s}, [%[tw2]] \n\t"

        "fmul %[q2_out1r].4s, v0.4s, %[q2_in1r].4s \n\t"
        "fmul %[q2_out1i].4s, v0.4s, %[q2_in1i].4s \n\t"
        "fmls %[q2_out1r].4s, v1.4s, %[q2_in1i].4s \n\t"
        "fmla %[q2_out1i].4s, v1.4s, %[q2_in1r].4s \n\t"

        "fmul %[q2_out2r].4s, v2.4s, %[q2_in2r].4s \n\t"
        "fmul %[q2_out2i].4s, v2.4s, %[q2_in2i].4s \n\t"
        "fmls %[q2_out2r].4s, v3.4s, %[q2_in2i].4s \n\t"
        "fmla %[q2_out2i].4s, v3.4s, %[q2_in2r].4s \n\t"

        "fmul %[q2_out3r].4s, v4.4s, %[q2_in3r].4s \n\t"
        "fmul %[q2_out3i].4s, v4.4s, %[q2_in3i].4s \n\t"
        "fmls %[q2_out3r].4s, v5.4s, %[q2_in3i].4s \n\t"
        "fmla %[q2_out3i].4s, v5.4s, %[q2_in3r].4s \n\t"
        : [q2_out1r] "+w" (q2_out1.val[0]),
          [q2_out1i] "+w" (q2_out1.val[1]),
          [q2_out2r] "+w" (q2_out2.val[0]),
          [q2_out2i] "+w" (q2_out2.val[1]),
          [q2_out3r] "+w" (q2_out3.val[0]),
          [q2_out3i] "+w" (q2_out3.val[1])
        : [q2_in1r] "w" (q2_in1.val[0]),
          [q2_in1i] "w" (q2_in1.val[1]),
          [q2_in2r] "w" (q2_in2.val[0]),
          [q2_in2i] "w" (q2_in2.val[1]),
          [q2_in3r] "w" (q2_in3.val[0]),
          [q2_in3i] "w" (q2_in3.val[1])
        : "memory", "v0", "v1", "v2", "v3", "v4", "v5"
    );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

    q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
    q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
    q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
    q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
    q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
    q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
    q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
    q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);

    q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
    q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
    q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
    q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);

    q2_out3.val[1] = vnegq_f32 (q2_out3.val[1]);
    q2_out2.val[1] = vnegq_f32 (q2_out2.val[1]);

#ifndef NE10_INLINE_ASM_OPT
    q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
    q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);

    q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
    q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);

    vst2q_f32 (fout_r, q2_out0);
    vst2q_f32 (fout_r + (nfft >> 1), q2_out1);
    vst2q_f32 (fout_b + (nfft >> 1), q2_out3);
    vst2q_f32 (fout_b + nfft, q2_out2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
    asm volatile (
        "fadd v0.4s, %[q2_in0r].4s, %[q2_in2r].4s \n\t"
        "fadd v1.4s, %[q2_in0i].4s, %[q2_in2i].4s \n\t"
        "fadd v2.4s, %[q2_in1r].4s, %[q2_in3i].4s \n\t"
        "fsub v3.4s, %[q2_in1i].4s, %[q2_in3r].4s \n\t"

        "rev64 %[q2_in2r].4s, %[q2_out2r].4s \n\t"
        "rev64 %[q2_in2i].4s, %[q2_out2i].4s \n\t"
        "rev64 %[q2_in3r].4s, %[q2_out3r].4s \n\t"
        "rev64 %[q2_in3i].4s, %[q2_out3i].4s \n\t"
        "ext v4.16b, %[q2_in2r].16b, %[q2_in2r].16b, #8 \n\t"
        "ext v5.16b, %[q2_in2i].16b, %[q2_in2i].16b, #8 \n\t"
        "ext v6.16b, %[q2_in3r].16b, %[q2_in3r].16b, #8 \n\t"
        "ext v7.16b, %[q2_in3i].16b, %[q2_in3i].16b, #8 \n\t"

        "st2 {v0.4s, v1.4s}, [%[fout0]] \n\t"
        "st2 {v2.4s, v3.4s}, [%[fout1]] \n\t"
        "st2 {v4.4s, v5.4s}, [%[fout2]] \n\t"
        "st2 {v6.4s, v7.4s}, [%[fout3]] \n\t"
        :
        : [fout0] "r" (fout_r),
          [fout1] "r" (fout_r + (nfft>>1)),
          [fout2] "r" (fout_b + nfft),
          [fout3] "r" (fout_b + (nfft>>1)),
          [q2_out2r] "w" (q2_out2.val[0]),
          [q2_out2i] "w" (q2_out2.val[1]),
          [q2_out3r] "w" (q2_out3.val[0]),
          [q2_out3i] "w" (q2_out3.val[1]),
          [q2_in0r] "w" (q2_in0.val[0]),
          [q2_in0i] "w" (q2_in0.val[1]),
          [q2_in1r] "w" (q2_in1.val[0]),
          [q2_in1i] "w" (q2_in1.val[1]),
          [q2_in2r] "w" (q2_in2.val[0]),
          [q2_in2i] "w" (q2_in2.val[1]),
          [q2_in3r] "w" (q2_in3.val[0]),
          [q2_in3i] "w" (q2_in3.val[1])
        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
    );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

for ( ; loop_count > 0; loop_count-- )
q2_in0 = vld2q_f32(fin_r            );
q2_in1 = vld2q_f32(fin_r + (nfft>>1));

q2_in3 = vld2q_f32(fin_b + (nfft>>1));
q2_in2 = vld2q_f32(fin_b + nfft     );

q2_tw0 = vld2q_f32(tw);
q2_tw1 = vld2q_f32(tw);
q2_tw2 = vld2q_f32(tw);

q2_in2.val[1] = vnegq_f32( q2_in2.val[1] );
q2_in3.val[1] = vnegq_f32( q2_in3.val[1] );
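/* Negating the imaginary parts of q2_in2/q2_in3 above conjugates those inputs
   for the inverse (c2r) direction before the butterflies below. */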
q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);

q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);

q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);

q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);

q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);

q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);

q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);

q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);
vst1q_f32(fout_r, q2_in0.val[0]);
vst1q_f32(fout_r, q2_in0.val[1]);
vst1q_f32(fout_r, q2_in1.val[0]);
vst1q_f32(fout_r, q2_in1.val[1]);
vst1q_f32(fout_r, q2_in2.val[0]);
vst1q_f32(fout_r, q2_in2.val[1]);
vst1q_f32(fout_r, q2_in3.val[0]);
vst1q_f32(fout_r, q2_in3.val[1]);
ne10_radix2_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin);
fout[0].r = fout[0].i;

ne10_radix4_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 4);
fout[0].r = fout[0].i;

ne10_radix8_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
fout[0].r = fout[0].i;

fout[cfg->nfft / 2].r = fout[0].i;

fout[0].i = fout[cfg->nfft / 2].i = 0.0f;

fin[0].i = fin[0].r;
ne10_radix2_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i);
fin[0].r = fin[0].i;

fin[0].i = fin[0].r;
ne10_radix4_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 4);
fin[0].r = fin[0].i;

fin[0].i = fin[0].r;
ne10_radix8_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
fin[0].r = fin[0].i;

stage_count = cfg->r_factors_neon[0];
radix = cfg->r_factors_neon[ stage_count << 1 ];

fin[0].i = fin[cfg->nfft>>1].r;
fout_c = (stage_count % 2 == 1) ? tmpbuf : (CPLX*) fout;
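/*
 * fout_c starts at tmpbuf when the NEON stage count is odd, presumably so that
 * the per-stage buffer swaps (ne10_swap_ptr) leave the final stage's result in
 * fout.
 */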
#define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN, Q_IN, IN_STEP)
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
#define NE10_DECLARE_4(TYPE, NAME)
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT, Q2_IN)
#define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT, Q_IN)
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer)
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
#define ne10_swap_ptr(X, Y)
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
#define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN, Q_IN, IN_STEP)
#define NE10_CPX_MUL_INV_NEON_F32(Z, A, B)
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT, Q_IN)
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT, Q2_IN)
NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer)
#define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT, Q2_IN, Q2_TW)
NE10_INLINE void ne10_radix8x4_r2c_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft)
NE10_INLINE void ne10_radix8x4_c2r_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft)
#define NE10_FFT_R2C_CC_CC(OUT, IN)
#define NE10_FFT_C2R_RCR_4R(OUT, IN)
#define NE10_INV_BUTTERFLY_TMP(I1, I2, J1, J2, K1, K2, S1, S2)
#define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT, Q_OUT, OUT_STEP)
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft, const ne10_fft_cpx_float32_t *twiddles)
#define NE10_DECLARE_3(TYPE, NAME)
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon(float32x4_t *Fout_neon, const float32x4_t *Fin_neon, const ne10_int32_t out_step, const ne10_int32_t in_step, const ne10_fft_cpx_float32_t *twiddles)
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT, Q2_IN, Q2_TW)
NE10_INLINE void ne10_radix4x4_c2r_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft)
#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT, Q2_IN)
#define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT, Q_OUT, OUT_STEP)
#define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT, Q_IN)
#define NE10_DECLARE_8(TYPE, NAME)
#define NE10_REVERSE_FLOAT32X4(VECTOR4F)
#define NE10_CPX_CONJ_MUL_F32(Z, A, B)
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft, const ne10_fft_cpx_float32_t *twiddles)
#define NE10_FFT_R2C_4R_CC(OUT, IN)
#define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT, Q_IN)
void ne10_fft_c2r_1d_float32_neon(ne10_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Specific implementation of ne10_fft_c2r_1d_float32 using NEON SIMD capabilities; a usage sketch for both NEON entry points follows at the end of this list.
NE10_INLINE void ne10_radix4x4_r2c_neon(ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t fstride, const ne10_int32_t mstride, const ne10_int32_t nfft)
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
#define NE10_CPX_MUL_F32(Z, A, B)
ne10_fft_cpx_float32_t * buffer
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
#define NE10_PRINT_Q2_VECTOR(Q2_VECTOR)
#define NE10_FFT_C2R_CC_CC(OUT, IN)
#define NE10_FFT_R2C_4R_RCR(OUT, IN)
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
#define NE10_FFT_C2R_CC_4R(OUT, IN)
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly(ne10_fft_cpx_float32_t *dst, const ne10_fft_cpx_float32_t *src, const ne10_fft_cpx_float32_t *twiddles, const ne10_int32_t nfft)
#define NE10_CPX_MUL_NEON_F32(Z, A, B)
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT, Q_IN)
#define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT, Q_IN)
void ne10_fft_r2c_1d_float32_neon(ne10_fft_cpx_float32_t *fout, ne10_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
Specific implementation of ne10_fft_r2c_1d_float32 using NEON SIMD capabilities.
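A minimal usage sketch for the two NEON entry points listed above. It assumes the umbrella NE10.h header and the standard Ne10 plan helpers ne10_fft_alloc_r2c_float32 / ne10_fft_destroy_r2c_float32, which live outside this file; the example_rfft_roundtrip name and the buffer-size comments are illustrative only.

#include "NE10.h"

/* Forward r2c transform followed by the inverse c2r transform. */
void example_rfft_roundtrip (ne10_float32_t *time_in,       /* nfft real samples       */
                             ne10_fft_cpx_float32_t *freq,  /* nfft/2 + 1 complex bins */
                             ne10_float32_t *time_out,      /* nfft real samples       */
                             ne10_int32_t nfft)
{
    ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (nfft);

    ne10_fft_r2c_1d_float32_neon (freq, time_in, cfg);   /* real-to-complex (forward) */
    ne10_fft_c2r_1d_float32_neon (time_out, freq, cfg);  /* complex-to-real (inverse) */

    ne10_fft_destroy_r2c_float32 (cfg);
}

As the scaling fragments earlier in the listing suggest, the inverse output is only divided by nfft when the library is built with NE10_DSP_RFFT_SCALING defined; otherwise the round trip comes back scaled by nfft.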