typedef float32x4x2_t CPLX; /* four complex values: .val[0] = real lanes, .val[1] = imaginary lanes */
typedef float32x4_t REAL;   /* four packed real values */

#define NE10_REAL_DUP_NEON_F32 vdupq_n_f32
#define NE10_CPLX_LOAD(PTR) vld2q_f32 ((ne10_float32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    vst2q_f32 ((ne10_float32_t*) (PTR), OUT);

static inline REAL NE10_S_MUL_NEON_F32 (const REAL vec,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    REAL result = scalar_neon * vec;
    return result;
}

static inline REAL NE10_S_MLA_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlaq_f32 (dst, src, scalar_neon);
}

static inline REAL NE10_S_MLS_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlsq_f32 (dst, src, scalar_neon);
}
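/*
 * Illustrative sketch, not part of the original source: example_cplx_mul_by_twiddle
 * is a hypothetical helper showing how the scalar-broadcast helpers above compose
 * into the complex multiply that NE10_CPX_MUL_NEON_F32 is expected to perform on
 * four complex values at once, assuming the split real/imaginary layout of CPLX:
 *   out.r = in.r*wr - in.i*wi
 *   out.i = in.r*wi + in.i*wr
 */
static inline CPLX example_cplx_mul_by_twiddle (const CPLX in,
        const ne10_float32_t wr,
        const ne10_float32_t wi)
{
    CPLX out;
    out.val[0] = NE10_S_MUL_NEON_F32 (in.val[0], wr);             /* r*wr        */
    out.val[0] = NE10_S_MLS_NEON_F32 (out.val[0], in.val[1], wi); /* r*wr - i*wi */
    out.val[1] = NE10_S_MUL_NEON_F32 (in.val[0], wi);             /* r*wi        */
    out.val[1] = NE10_S_MLA_NEON_F32 (out.val[1], in.val[1], wr); /* r*wi + i*wr */
    return out;
}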
static inline void NE10_FFT2_MUL_TW_NEON (CPLX scratch_out[2],
        const CPLX scratch_in[2],
        const CPLX scratch_tw[1])
{
    scratch_out[0] = scratch_in[0];
    NE10_CPX_MUL_NEON_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
}

static inline void NE10_FFT3_MUL_TW_NEON (CPLX scratch_out[3],
        const CPLX scratch_in[3],
        const CPLX scratch_tw[2])
{
    NE10_FFT2_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
}

static inline void NE10_FFT4_MUL_TW_NEON (CPLX scratch_out[4],
        const CPLX scratch_in[4],
        const CPLX scratch_tw[3])
{
    NE10_FFT3_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
}

static inline void NE10_FFT5_MUL_TW_NEON (CPLX scratch_out[5],
        const CPLX scratch_in[5],
        const CPLX scratch_tw[4])
{
    NE10_FFT4_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
}
static inline void NE10_FFT2_CONJ (CPLX scratch_out[2])
{
    scratch_out[0].val[1] = -scratch_out[0].val[1];
    scratch_out[1].val[1] = -scratch_out[1].val[1];
}

static inline void NE10_FFT3_CONJ (CPLX scratch_out[3])
{
    NE10_FFT2_CONJ (scratch_out);
    scratch_out[2].val[1] = -scratch_out[2].val[1];
}

static inline void NE10_FFT4_CONJ (CPLX scratch_out[4])
{
    NE10_FFT3_CONJ (scratch_out);
    scratch_out[3].val[1] = -scratch_out[3].val[1];
}

static inline void NE10_FFT5_CONJ (CPLX scratch_out[5])
{
    NE10_FFT4_CONJ (scratch_out);
    scratch_out[4].val[1] = -scratch_out[4].val[1];
}

static inline void NE10_FFT8_CONJ (CPLX scratch_out[8])
{
    NE10_FFT5_CONJ (scratch_out);
    scratch_out[5].val[1] = -scratch_out[5].val[1];
    scratch_out[6].val[1] = -scratch_out[6].val[1];
    scratch_out[7].val[1] = -scratch_out[7].val[1];
}
static inline void NE10_FFT2_SCALING (CPLX scratch_out[2],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    scratch_out[0].val[0] *= one_by_fft_neon;
    scratch_out[0].val[1] *= one_by_fft_neon;
    scratch_out[1].val[0] *= one_by_fft_neon;
    scratch_out[1].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT3_SCALING (CPLX scratch_out[3],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT2_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[2].val[0] *= one_by_fft_neon;
    scratch_out[2].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT4_SCALING (CPLX scratch_out[4],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT3_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[3].val[0] *= one_by_fft_neon;
    scratch_out[3].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT5_SCALING (CPLX scratch_out[5],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT4_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[4].val[0] *= one_by_fft_neon;
    scratch_out[4].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT8_SCALING (CPLX scratch_out[8],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT5_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[5].val[0] *= one_by_fft_neon;
    scratch_out[5].val[1] *= one_by_fft_neon;
    scratch_out[6].val[0] *= one_by_fft_neon;
    scratch_out[6].val[1] *= one_by_fft_neon;
    scratch_out[7].val[0] *= one_by_fft_neon;
    scratch_out[7].val[1] *= one_by_fft_neon;
#endif
}
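/*
 * Sketch only (the helper name and setup are assumptions, not taken from the
 * listing): the 1/N factor consumed by the scaling helpers above can be
 * broadcast once per transform; the multiplies compile away entirely unless
 * NE10_DSP_CFFT_SCALING is defined.
 */
static inline void example_scale_radix8_output (CPLX scratch_out[8],
        const ne10_int32_t nfft)
{
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / (ne10_float32_t) nfft);
    NE10_FFT8_SCALING (scratch_out, one_by_fft_neon); /* no-op without NE10_DSP_CFFT_SCALING */
}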
static inline void NE10_FFT2_FUC_NEON_F32 (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_F32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}

static inline void NE10_FFT3_FUC_NEON_F32 (CPLX Fout[3],
        const CPLX Fin[3])
{
    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);
    /* ... */
    Fout[1].val[0] = Fin[0].val[0] - Fout[2].val[0] * HALF_NEON_F32;
    Fout[1].val[1] = Fin[0].val[1] - Fout[2].val[1] * HALF_NEON_F32;

    Fout[0].val[0] = Fout[0].val[0] * TW_3IN_NEON_F32;
    Fout[0].val[1] = Fout[0].val[1] * TW_3IN_NEON_F32;
    /* ... */
}
static inline void NE10_FFT4_FUC_NEON_F32 (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];
    /* ... */
    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}

static inline void NE10_FFT4_FUC_INPLACE_NEON_F32 (CPLX scratch_out[4])
{
    CPLX scratch[4];
    /* ... */
    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}
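/*
 * Illustration under assumptions, not code from this file: example_inverse_radix4
 * is hypothetical. It shows how the conjugation and scaling helpers let one
 * in-place forward butterfly serve for the inverse transform as well, using
 * ifft(x) = conj(fft(conj(x))) / N.
 */
static inline void example_inverse_radix4 (CPLX io[4], const REAL one_by_fft_neon)
{
    NE10_FFT4_CONJ (io);                      /* conjugate the inputs             */
    NE10_FFT4_FUC_INPLACE_NEON_F32 (io);      /* forward 4-point butterfly        */
    NE10_FFT4_CONJ (io);                      /* conjugate back                   */
    NE10_FFT4_SCALING (io, one_by_fft_neon);  /* optional 1/N (compile-time gate) */
}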
static inline void NE10_FFT5_FUC_INPLACE_NEON_F32 (CPLX Fout[5])
{
    CPLX s[6];
    /* ... */
    Fout[0].val[0] = Fout[0].val[0] + s[1].val[0] + s[2].val[0];
    Fout[0].val[1] = Fout[0].val[1] + s[1].val[1] + s[2].val[1];

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

    s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
    s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
    s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
    s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

    s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
    s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
    s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
    s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);
    /* ... */
}
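/*
 * For reference only: TW_5A_F32 and TW_5B_F32 are defined elsewhere in Ne10.
 * The values below are an assumption (cos/sin of 2*pi/5 and 4*pi/5 with
 * negated sine), shown with a hypothetical local type so the snippet stands
 * alone; consult the library headers for the authoritative constants.
 */
typedef struct { ne10_float32_t r; ne10_float32_t i; } example_cpx_float32_t;

static const example_cpx_float32_t EXAMPLE_TW_5A = {  0.30901699f, -0.95105652f }; /* ~ exp(-2i*pi/5) */
static const example_cpx_float32_t EXAMPLE_TW_5B = { -0.80901699f, -0.58778525f }; /* ~ exp(-4i*pi/5) */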
#define NE10_BUTTERFLY_INDEX_NEON_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
    do { \
        NE10_CPX_ADD_NEON_F32 (OUT[OUT_I], IN[IN_I], IN[IN_J]); \
        NE10_CPX_SUB_NEON_F32 (OUT[OUT_J], IN[IN_I], IN[IN_J]); \
    } while (0)

static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8],
        const CPLX in[8])
{
    static const ne10_fft_cpx_float32_t TW_8[] =
    {
        {  1.00000,  0.00000 },
        {  0.70711, -0.70711 },
        {  0.00000, -1.00000 },
        { -0.70711, -0.70711 },
        /* ... */
    };
    /* ... */

#define NE10_CPX_MUL_TW8_NEON_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
    do { \
        ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
        CPLX TW_TMP_NEON; \
        TW_TMP_NEON.val[0] = NE10_REAL_DUP_NEON_F32 (TW_TMP.r); \
        TW_TMP_NEON.val[1] = NE10_REAL_DUP_NEON_F32 (TW_TMP.i); \
        NE10_CPX_MUL_NEON_F32 (OUT[OUT_I], OUT[OUT_I], TW_TMP_NEON); \
    } while (0)

    /* ... */
#undef NE10_CPX_MUL_TW8_NEON_F32
}

template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_2_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[2];
    CPLX out[2];
    ne10_int32_t f_count, m_count;
    const ne10_int32_t in_step = nfft / 2;
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / nfft);
    /* ... */
    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            /* ... */
            if (is_first_stage == 0)
            {
                /* ... */
            }

            NE10_FFT2_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT2_CONJ (out);
            }

            if (is_scaled)
            {
                NE10_FFT2_SCALING (out, one_by_fft_neon);
            }
            /* ... */
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step)
                : "memory", "v0", "v1", "v2", "v3");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3");
            }
            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 {v4.1d}, [%[ptw]] \n\t"
                    "fmul v14.4s, v2.4s, v4.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v4.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v4.s[1] \n\t"
                    "fmul v15.4s, v3.4s, v4.s[0] \n\t"
                    "fadd v3.4s, v14.4s, v15.4s \n\t"
                    :
                    : [ptw]"r"(twiddles)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v14", "v15");
            }
502 "fadd v4.4s, v0.4s, v2.4s \n\t" 503 "fadd v5.4s, v1.4s, v3.4s \n\t" 504 "fsub v6.4s, v0.4s, v2.4s \n\t" 505 "fsub v7.4s, v1.4s, v3.4s \n\t" 509 "v0",
"v1",
"v2",
"v3",
510 "v4",
"v5",
"v6",
"v7");
515 "fneg v5.4s, v5.4s \n\t" 516 "fneg v7.4s, v7.4s \n\t" 519 :
"v4",
"v5",
"v6",
"v7");
525 "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t" 526 "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t" 527 "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t" 528 "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t" 530 : [one_by_nfft]
"w"(one_by_fft_neon)
531 :
"v4",
"v5",
"v6",
"v7");
535 "st2 {v4.4s, v5.4s}, [%[pout0]] \n\t" 536 "st2 {v6.4s, v7.4s}, [%[pout1]] \n\t" 539 [pout1]
"r"(Fout + out_step)
540 :
"memory",
"v4",
"v5",
"v6",
"v7");
541 #endif // __aarch64__ 542 #endif // NE10_INLINE_ASM_OPT 546 if (is_first_stage == 0)
556 if (is_first_stage == 0)
558 twiddles -= out_step;
559 Fout += (2 - 1) * out_step;
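/*
 * For comparison, a scalar rendering of the per-element work in one radix-2
 * pass (a sketch, not Ne10 code; the twiddle multiply on the second input is
 * assumed to have happened already, as in the template above when
 * is_first_stage == 0).
 */
static inline void example_scalar_radix2_butterfly (ne10_fft_cpx_float32_t *out0,
        ne10_fft_cpx_float32_t *out1,
        const ne10_fft_cpx_float32_t in0,
        const ne10_fft_cpx_float32_t in1_tw) /* second input, already twiddled */
{
    out0->r = in0.r + in1_tw.r;
    out0->i = in0.i + in1_tw.i;
    out1->r = in0.r - in1_tw.r;
    out1->i = in0.i - in1_tw.i;
}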
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_4_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[4];
    ne10_int32_t f_count, m_count;
    const ne10_int32_t in_step = nfft / 4;
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / nfft);
    /* ... */
#ifdef NE10_INLINE_ASM_OPT
    CPLX s[4];
#endif

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            /* ... */
            if (is_first_stage == 0)
            {
                /* ... */
            }

            NE10_FFT4_FUC_INPLACE_NEON_F32 (in);

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (in);
            }
            if (is_scaled)
            {
                NE10_FFT4_SCALING (in, one_by_fft_neon);
            }
            /* ... */
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
/* Clobber lists shared by the radix-4 asm blocks below. */
#define NEON_REGISTERS_C2C_R4 \
    "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
#define NEON_REGISTERS_C2C_TW_R4 \
    "v8", "v9", "v10"

            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t"
                "ld2 {v6.4s, v7.4s}, [%[pin3]] \n\t"
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step),
                  [pin2]"r"(Fin + in_step * 2),
                  [pin3]"r"(Fin + in_step * 3)
                : "memory", NEON_REGISTERS_C2C_R4);

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : NEON_REGISTERS_C2C_R4);
            }
            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 { v8.1d}, [%[ptw0]] \n\t"
                    "ld1 { v9.1d}, [%[ptw1]] \n\t"
                    "ld1 {v10.1d}, [%[ptw2]] \n\t"

                    "fmul v14.4s, v2.4s, v8.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v8.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v8.s[1] \n\t"
                    "fmul v15.4s, v3.4s, v8.s[0] \n\t"
                    "fadd v3.4s, v14.4s, v15.4s \n\t"

                    "fmul v14.4s, v4.4s, v9.s[1] \n\t"
                    "fmul v4.4s, v4.4s, v9.s[0] \n\t"
                    "fmls v4.4s, v5.4s, v9.s[1] \n\t"
                    "fmul v15.4s, v5.4s, v9.s[0] \n\t"
                    "fadd v5.4s, v14.4s, v15.4s \n\t"

                    "fmul v14.4s, v6.4s, v10.s[1] \n\t"
                    "fmul v6.4s, v6.4s, v10.s[0] \n\t"
                    "fmls v6.4s, v7.4s, v10.s[1] \n\t"
                    "fmul v15.4s, v7.4s, v10.s[0] \n\t"
                    "fadd v7.4s, v14.4s, v15.4s \n\t"
                    :
                    : [ptw0]"r"(twiddles),
                      [ptw1]"r"(twiddles + out_step),
                      [ptw2]"r"(twiddles + out_step * 2)
                    : "memory", NEON_REGISTERS_C2C_R4,
                      NEON_REGISTERS_C2C_TW_R4, "v14", "v15");
            }
690 "fadd %[s0r].4s, v0.4s, v4.4s \n\t" 691 "fadd %[s0i].4s, v1.4s, v5.4s \n\t" 693 "fsub %[s1r].4s, v0.4s, v4.4s \n\t" 694 "fsub %[s1i].4s, v1.4s, v5.4s \n\t" 696 "fadd %[s2r].4s, v2.4s, v6.4s \n\t" 697 "fadd %[s2i].4s, v3.4s, v7.4s \n\t" 699 "fsub %[s3r].4s, v2.4s, v6.4s \n\t" 700 "fsub %[s3i].4s, v3.4s, v7.4s \n\t" 701 : [s0r]
"+w"(s[0].val[0]),
702 [s0i]
"+w"(s[0].val[1]),
703 [s1r]
"+w"(s[1].val[0]),
704 [s1i]
"+w"(s[1].val[1]),
705 [s2r]
"+w"(s[2].val[0]),
706 [s2i]
"+w"(s[2].val[1]),
707 [s3r]
"+w"(s[3].val[0]),
708 [s3i]
"+w"(s[3].val[1])
710 : NEON_REGISTERS_C2C_R4);
713 "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t" 714 "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t" 715 "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t" 716 "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t" 718 "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t" 719 "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t" 720 "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t" 721 "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t" 723 : [s0r]
"w"(s[0].val[0]),
724 [s0i]
"w"(s[0].val[1]),
725 [s1r]
"w"(s[1].val[0]),
726 [s1i]
"w"(s[1].val[1]),
727 [s2r]
"w"(s[2].val[0]),
728 [s2i]
"w"(s[2].val[1]),
729 [s3r]
"w"(s[3].val[0]),
730 [s3i]
"w"(s[3].val[1])
731 : NEON_REGISTERS_C2C_R4);
736 "fneg v1.4s, v1.4s \n\t" 737 "fneg v3.4s, v3.4s \n\t" 738 "fneg v5.4s, v5.4s \n\t" 739 "fneg v7.4s, v7.4s \n\t" 742 : NEON_REGISTERS_C2C_R4);
748 "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t" 749 "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t" 750 "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t" 751 "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t" 752 "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t" 753 "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t" 754 "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t" 755 "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t" 757 : [one_by_nfft]
"w"(one_by_fft_neon)
758 : NEON_REGISTERS_C2C_R4);
762 "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t" 763 "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t" 764 "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t" 765 "st2 {v6.4s, v7.4s}, [%[pout3]] \n\t" 768 [pout1]
"r"(Fout + out_step),
769 [pout2]
"r"(Fout + out_step * 2),
770 [pout3]
"r"(Fout + out_step * 3)
771 : NEON_REGISTERS_C2C_R4);
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            /* ... */
            if (is_first_stage == 0)
            {
                /* ... */
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (4 - 1) * out_step;
        }
        /* ... */
    }
}
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_3_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX out[3];
    CPLX s[4];
    ne10_int32_t f_count, m_count;
    const ne10_int32_t in_step = nfft / 3;
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / nfft);
    /* ... */

    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            /* ... */
            if (is_first_stage == 0)
            {
                /* ... */
            }
            /* ... */
            s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
            s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;

            s[1].val[0] += s[3].val[0];
            s[1].val[1] += s[3].val[1];

            s[0].val[0] *= TW_3IN_NEON_F32;
            s[0].val[1] *= TW_3IN_NEON_F32;

            out[0].val[0] = s[3].val[0] + s[2].val[0];
            out[0].val[1] = s[3].val[1] + s[2].val[1];
            out[1].val[0] = s[1].val[0] - s[0].val[1];
            out[1].val[1] = s[1].val[1] + s[0].val[0];
            out[2].val[0] = s[1].val[0] + s[0].val[1];
            out[2].val[1] = s[1].val[1] - s[0].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT3_CONJ (out);
            }
            if (is_scaled)
            {
                NE10_FFT3_SCALING (out, one_by_fft_neon);
            }
            /* ... */
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t"
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step),
                  [pin2]"r"(Fin + in_step * 2)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5");
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 {v6.1d}, [%[ptw0]] \n\t"
                    "ld1 {v7.1d}, [%[ptw1]] \n\t"

                    "fmul v10.4s, v2.4s, v6.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v6.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v6.s[1] \n\t"
                    "fmul v11.4s, v3.4s, v6.s[0] \n\t"
                    "fadd v3.4s, v10.4s, v11.4s \n\t"

                    "fmul v10.4s, v4.4s, v7.s[1] \n\t"
                    "fmul v4.4s, v4.4s, v7.s[0] \n\t"
                    "fmls v4.4s, v5.4s, v7.s[1] \n\t"
                    "fmul v11.4s, v5.4s, v7.s[0] \n\t"
                    "fadd v5.4s, v10.4s, v11.4s \n\t"
                    :
                    : [ptw0]"r"(twiddles),
                      [ptw1]"r"(twiddles + out_step)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                      "v6", "v7", "v8", "v9", "v10", "v11");
            }
917 "fadd %[s2r].4s, v2.4s, v4.4s \n\t" 918 "fadd %[s2i].4s, v3.4s, v5.4s \n\t" 920 "fsub %[s0r].4s, v2.4s, v4.4s \n\t" 921 "fsub %[s0i].4s, v3.4s, v5.4s \n\t" 923 "mov %[s3r].16b, v0.16b \n\t" 924 "mov %[s3i].16b, v1.16b \n\t" 925 : [s0r]
"+w"(s[0].val[0]),
926 [s0i]
"+w"(s[0].val[1]),
927 [s2r]
"+w"(s[2].val[0]),
928 [s2i]
"+w"(s[2].val[1]),
929 [s3r]
"+w"(s[3].val[0]),
930 [s3i]
"+w"(s[3].val[1])
932 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5");
934 s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
935 s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;
937 s[1].val[0] += s[3].val[0];
938 s[1].val[1] += s[3].val[1];
939 s[0].val[0] *= TW_3IN_NEON_F32;
940 s[0].val[1] *= TW_3IN_NEON_F32;
946 "fadd v0.4s, %[s3r].4s, %[s2r].4s \n\t" 947 "fadd v1.4s, %[s3i].4s, %[s2i].4s \n\t" 948 "fsub v2.4s, %[s1r].4s, %[s0i].4s \n\t" 949 "fadd v3.4s, %[s1i].4s, %[s0r].4s \n\t" 950 "fadd v4.4s, %[s1r].4s, %[s0i].4s \n\t" 951 "fsub v5.4s, %[s1i].4s, %[s0r].4s \n\t" 953 : [s0r]
"w"(s[0].val[0]),
954 [s0i]
"w"(s[0].val[1]),
955 [s1r]
"w"(s[1].val[0]),
956 [s1i]
"w"(s[1].val[1]),
957 [s2r]
"w"(s[2].val[0]),
958 [s2i]
"w"(s[2].val[1]),
959 [s3r]
"w"(s[3].val[0]),
960 [s3i]
"w"(s[3].val[1])
961 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5");
966 "fneg v1.4s, v1.4s \n\t" 967 "fneg v3.4s, v3.4s \n\t" 968 "fneg v5.4s, v5.4s \n\t" 977 "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t" 978 "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t" 979 "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t" 980 "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t" 981 "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t" 982 "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t" 984 : [one_by_nfft]
"w"(one_by_fft_neon)
985 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5");
989 "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t" 990 "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t" 991 "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t" 994 [pout1]
"r"(Fout + out_step),
995 [pout2]
"r"(Fout + 2 * out_step)
996 :
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5");
997 #endif // __aarch64__ 998 #endif // NE10_INLINE_ASM_OPT 1002 if (is_first_stage == 0)
1012 if (is_first_stage == 0)
1014 twiddles -= out_step;
1015 Fout += (3 - 1) * out_step;
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_5_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[5];
    CPLX s[6];
    ne10_int32_t f_count, m_count;
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / nfft);
    /* ... */

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            /* ... */
            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                /* ... */
            }
            /* ... */
            in[0].val[0] = in[0].val[0] + s[1].val[0] + s[2].val[0];
            in[0].val[1] = in[0].val[1] + s[1].val[1] + s[2].val[1];

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

            s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
            s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
            s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
            s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

            s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
            s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
            s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
            s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);
            /* ... */
            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }
            if (is_scaled)
            {
                NE10_FFT5_SCALING (in, one_by_fft_neon);
            }
            /* ... */
            if (is_first_stage == 0)
            {
                /* ... */
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (5 - 1) * out_step;
        }
        /* ... */
    }
}
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_8_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[8];
    CPLX out[8];
    ne10_int32_t f_count, m_count;
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (1.0f / nfft);
    /* ... */

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            /* ... */
            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (in);
            }

            NE10_FFT8_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (out);
            }
            if (is_scaled)
            {
                NE10_FFT8_SCALING (out, one_by_fft_neon);
            }
            /* ... */
        }
        /* ... */
    }
}
template<ne10_int32_t is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_float32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t stage_count, fstride, mstride, radix, nfft;

    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[ stage_count << 1 ]; /* radix of the first stage */
    nfft = fstride * radix;

    /* ... */
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    /* First stage: no twiddles, out_step == 1. */
    switch (radix)
    {
    case 2: ne10_radix_2_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL, fstride, 1, nfft); break;
    case 4: ne10_radix_4_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL, fstride, 1, nfft); break;
    case 3: ne10_radix_3_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL, fstride, 1, nfft); break;
    case 5: ne10_radix_5_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL, fstride, 1, nfft); break;
    case 8: ne10_radix_8_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL, fstride, 1, nfft); break;
    }
    /* ... */
    radix = factors[ stage_count << 1 ];

    /* Remaining stages: always twiddled, never scaled here. */
    while (stage_count > 0)
    {
        assert ((radix > 1) && (radix < 6));
        /* ... */
        switch (radix)
        {
        case 2: ne10_radix_2_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer, twiddles, fstride, mstride, nfft); break;
        case 3: ne10_radix_3_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer, twiddles, fstride, mstride, nfft); break;
        case 4: ne10_radix_4_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer, twiddles, fstride, mstride, nfft); break;
        case 5: ne10_radix_5_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer, twiddles, fstride, mstride, nfft); break;
        }

        twiddles += mstride * (radix - 1);
        /* ... */
        radix = factors[ stage_count << 1 ];
    }
}
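/*
 * Hypothetical helper, not part of the file: it reads the factor table exactly
 * the way the driver above does, which makes the layout concrete: entry 0 is
 * the stage count, entry 1 the first stage's stride, and entry 2*stage_count
 * the radix of the stage executed first.
 */
static inline ne10_int32_t example_nfft_from_factors (const ne10_int32_t *factors)
{
    const ne10_int32_t stage_count = factors[0];
    const ne10_int32_t fstride     = factors[1];
    const ne10_int32_t first_radix = factors[stage_count << 1];
    return fstride * first_radix; /* same product the driver assigns to nfft */
}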
template<ne10_int32_t is_inverse>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count, m_count;
    /* ... */

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];
        CPLX scratch[4];
        CPLX scratch_tw[3];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            /* ... */
            CPLX scratch0, scratch_in0;
            CPLX scratch1, scratch_in1;
            CPLX scratch2, scratch_in2;
            CPLX scratch3, scratch_in3;

            scratch_in0 = scratch_in[0];
            scratch_in1 = scratch_in[1];
            scratch_in2 = scratch_in[2];
            scratch_in3 = scratch_in[3];

            /* ... 4x4 transpose of the loaded columns (cf. NE10_RADIX4X4C_TRANSPOSE_NEON) ... */

            scratch_in[0] = scratch0;
            scratch_in[1] = scratch1;
            scratch_in[2] = scratch2;
            scratch_in[3] = scratch3;
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            const float *pin = (const float *) Fin;
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v6.4s, v7.4s}, [%[pin]] \n\t"

                "trn1 v8.4s, v0.4s, v2.4s \n\t"
                "trn2 v9.4s, v0.4s, v2.4s \n\t"
                "trn1 v10.4s, v4.4s, v6.4s \n\t"
                "trn2 v11.4s, v4.4s, v6.4s \n\t"

                "trn1 %[in0r].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1r].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2r].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3r].2d, v9.2d, v11.2d \n\t"

                "trn1 v8.4s, v1.4s, v3.4s \n\t"
                "trn2 v9.4s, v1.4s, v3.4s \n\t"
                "trn1 v10.4s, v5.4s, v7.4s \n\t"
                "trn2 v11.4s, v5.4s, v7.4s \n\t"

                "trn1 %[in0i].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1i].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2i].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3i].2d, v9.2d, v11.2d \n\t"
                : [in0r]"+w"(scratch_in[0].val[0]),
                  [in0i]"+w"(scratch_in[0].val[1]),
                  [in1r]"+w"(scratch_in[1].val[0]),
                  [in1i]"+w"(scratch_in[1].val[1]),
                  [in2r]"+w"(scratch_in[2].val[0]),
                  [in2i]"+w"(scratch_in[2].val[1]),
                  [in3r]"+w"(scratch_in[3].val[0]),
                  [in3i]"+w"(scratch_in[3].val[1]),
                  [pin]"+r"(pin)
                : [offset]"r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                  "v8", "v9", "v10", "v11");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (scratch_in);
            }

            /* Multiply the loaded values by the twiddles of this column. */
#ifndef NE10_INLINE_ASM_OPT
            scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
            scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
            scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);

            NE10_FFT4_MUL_TW_NEON (scratch_in, scratch_in, scratch_tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            const float *tw = (const float *) twiddles;
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[tw]], %[offset] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[tw]], %[offset] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[tw]] \n\t"

                "fmul v6.4s, %[in1r].4s, v1.4s \n\t"
                "fmul %[in1r].4s, %[in1r].4s, v0.4s \n\t"
                "fmls %[in1r].4s, %[in1i].4s, v1.4s \n\t"
                "fmul v7.4s, %[in1i].4s, v0.4s \n\t"
                "fadd %[in1i].4s, v6.4s, v7.4s \n\t"

                "fmul v6.4s, %[in2r].4s, v3.4s \n\t"
                "fmul %[in2r].4s, %[in2r].4s, v2.4s \n\t"
                "fmls %[in2r].4s, %[in2i].4s, v3.4s \n\t"
                "fmul v7.4s, %[in2i].4s, v2.4s \n\t"
                "fadd %[in2i].4s, v6.4s, v7.4s \n\t"

                "fmul v6.4s, %[in3r].4s, v5.4s \n\t"
                "fmul %[in3r].4s, %[in3r].4s, v4.4s \n\t"
                "fmls %[in3r].4s, %[in3i].4s, v5.4s \n\t"
                "fmul v7.4s, %[in3i].4s, v4.4s \n\t"
                "fadd %[in3i].4s, v6.4s, v7.4s \n\t"
                : [tw]"+r"(tw),
                  [in1r]"+w"(scratch_in[1].val[0]),
                  [in1i]"+w"(scratch_in[1].val[1]),
                  [in2r]"+w"(scratch_in[2].val[0]),
                  [in2i]"+w"(scratch_in[2].val[1]),
                  [in3r]"+w"(scratch_in[3].val[0]),
                  [in3i]"+w"(scratch_in[3].val[1])
                : [offset]"r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                  "v6", "v7");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            /* ... */
#ifndef NE10_INLINE_ASM_OPT
            /* ... */
            scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
            scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
            scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
            scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (scratch_out);
            }
            /* ... */
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
                "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
                :
                : [s0r]"w"(scratch[0].val[0]),
                  [s0i]"w"(scratch[0].val[1]),
                  [s1r]"w"(scratch[1].val[0]),
                  [s1i]"w"(scratch[1].val[1]),
                  [s2r]"w"(scratch[2].val[0]),
                  [s2i]"w"(scratch[2].val[1]),
                  [s3r]"w"(scratch[3].val[0]),
                  [s3i]"w"(scratch[3].val[1])
                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
            }
            float *pout = (float *) Fout;
            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout]] \n\t"
                : [pout]"+r"(pout)
                : [offset]"r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
            /* ... */
        }

        /* Tail: the out_step % 4 columns left over from the vectorized loop
         * are handled with plain scalar complex arithmetic. */
        /* ... */
        for (m_count = out_step % 4; m_count > 0; m_count--)
        {
            ne10_fft_cpx_float32_t scratch_in[4];
            ne10_fft_cpx_float32_t scratch_tw[3];
            /* ... */

            scratch_in[0] = Fin_s[0];
            scratch_in[1] = Fin_s[1];
            scratch_in[2] = Fin_s[2];
            scratch_in[3] = Fin_s[3];

            if (is_inverse == 1)
            {
                scratch_in[0].i = -scratch_in[0].i;
                scratch_in[1].i = -scratch_in[1].i;
                scratch_in[2].i = -scratch_in[2].i;
                scratch_in[3].i = -scratch_in[3].i;
            }

            scratch_tw[0] = twiddles[0 * out_step];
            scratch_tw[1] = twiddles[1 * out_step];
            scratch_tw[2] = twiddles[2 * out_step];

            FFT4_MUL_TW (scratch_in, scratch_in, scratch_tw);

            FFT4_FCU_INPLACE (scratch_in);

            if (is_inverse == 1)
            {
                scratch_in[0].i = -scratch_in[0].i;
                scratch_in[1].i = -scratch_in[1].i;
                scratch_in[2].i = -scratch_in[2].i;
                scratch_in[3].i = -scratch_in[3].i;
            }

            Fout_s[0 * out_step] = scratch_in[0];
            Fout_s[1 * out_step] = scratch_in[1];
            Fout_s[2 * out_step] = scratch_in[2];
            Fout_s[3 * out_step] = scratch_in[3];
            /* ... */
        }
        /* ... */
    }
}
void ne10_mixed_radix_generic_butterfly_float32_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    /* ... */
    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    if (is_scaled)
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, true>;
    else
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, false>;

    ne10_mixed_radix_impl ((CPLX *) buffer, (const CPLX *) Fin, factors, twiddles, (CPLX *) Fout);

    ne10_c2c_1d_last_stage_neon<0> ((CPLX *) Fout, (const CPLX *) buffer, /* ... */);
}

void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    /* ... */
    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    if (is_scaled)
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, true>;
    else
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, false>;

    ne10_mixed_radix_impl ((CPLX *) buffer, (const CPLX *) Fin, factors, twiddles, (CPLX *) Fout);

    ne10_c2c_1d_last_stage_neon<1> ((CPLX *) Fout, (const CPLX *) buffer, /* ... */);
}
/* Summary of the macros, typedefs, and functions defined in this file: */

#define NE10_FFT_PARA_LEVEL
#define NE10_REAL_DUP_NEON_F32
#define ne10_swap_ptr(X, Y)
#define NE10_CPLX_LOAD(PTR)
#define NE10_CPLX_STORE(PTR, OUT)
#define NE10_CPX_ADD_NEON_F32(Z, A, B)
#define NE10_CPX_SUB_NEON_F32(Z, A, B)
#define NE10_CPX_MUL_NEON_F32(Z, A, B)
#define NE10_CPX_MUL_TW8_NEON_F32(OUT, TW_8_TABLE, OUT_I, TW_J)
#define NE10_BUTTERFLY_INDEX_NEON_F32(OUT, IN, OUT_I, OUT_J, IN_I, IN_J)
#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT, Q2_IN)

typedef void (*NE10_MIXED_RADIX_FUNC)(CPLX *, const CPLX *, const ne10_int32_t *, const ne10_fft_cpx_float32_t *, CPLX *);

void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX], const ne10_fft_cpx_int32_t *ptr_in, const ne10_int32_t step);

void ne10_mixed_radix_generic_butterfly_float32_neon (ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer, const ne10_int32_t is_scaled);
void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_float32_t *twiddles, ne10_fft_cpx_float32_t *buffer, const ne10_int32_t is_scaled);
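/*
 * To place this file in context: these butterflies are normally exercised
 * through Ne10's public complex-to-complex API rather than called directly.
 * The sequence below is a usage sketch; the function names follow the public
 * Ne10 headers, while the chosen size, buffers, and error handling are
 * illustrative assumptions.
 */
#include "NE10.h"

/* Forward and inverse single-precision C2C FFT of length 480 = 2^5 * 3 * 5,
 * a size that exercises the mixed-radix generic path. */
static void example_c2c_roundtrip (ne10_fft_cpx_float32_t *dst,
        ne10_fft_cpx_float32_t *src,
        ne10_fft_cpx_float32_t *tmp)
{
    const ne10_int32_t nfft = 480;
    ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32_neon (nfft);
    if (cfg == NULL)
        return;

    ne10_fft_c2c_1d_float32_neon (tmp, src, cfg, 0); /* forward */
    ne10_fft_c2c_1d_float32_neon (dst, tmp, cfg, 1); /* inverse */

    ne10_fft_destroy_c2c_float32 (cfg);
}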