#ifndef NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
#define NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H

#include <assert.h>
#include <math.h>
#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_fft_generic_int32.h"

/* A CPLX holds four complex Q31 values, de-interleaved: val[0] carries the
 * real parts, val[1] the imaginary parts. */
typedef int32x4x2_t CPLX;
typedef int32x4_t   REAL;

// Number of butterflies processed per NEON iteration in the last stage.
#define NE10_FFT_PARA_LEVEL 4

#define NE10_REAL_DUP_NEON_S32 vdupq_n_s32

#ifndef NE10_INLINE_ASM_OPT
#define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
    } while (0)
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
template<class T>
inline static T NE10_CPLX_LOAD (T *ptr)
{
    T result;
    asm volatile (
        "ld2 {v0.4s, v1.4s}, [%[pin]] \n\t"
        "mov %[r].16b, v0.16b \n\t"
        "mov %[i].16b, v1.16b \n\t"
    : [r]"+w"(result.val[0]),
      [i]"+w"(result.val[1])
    : [pin]"r"(ptr)
    : "memory", "v0", "v1");
    return result;
}

template<class T>
inline static void NE10_CPLX_STORE (T *ptr, T out)
{
    asm volatile (
        "mov v0.16b, %[r].16b \n\t"
        "mov v1.16b, %[i].16b \n\t"
        "st2 {v0.4s, v1.4s}, [%[pout]] \n\t"
    : [r]"+w"(out.val[0]),
      [i]"+w"(out.val[1])
    : [pout]"r"(ptr)
    : "memory", "v0", "v1");
}
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

// Multiplies each Q31 lane of vec by the Q31 scalar, with rounding.
static inline REAL NE10_S_MUL_NEON_S32 (const REAL vec,
        const ne10_int32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_S32 (scalar);
    REAL result = vqrdmulhq_s32 (scalar_neon, vec);
    return result;
}
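/*
 * Note on vqrdmulhq_s32: it computes saturate (round ((2 * a * b) >> 32))
 * per lane, which is exactly a rounding Q31 x Q31 -> Q31 multiply. For
 * example, 0x40000000 * 0x40000000 (0.5 * 0.5 in Q31) yields 0x20000000
 * (0.25). This is why twiddles and scale factors are kept in Q31 here.
 */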
// Complex multiply, Q31: result = A * B, four complex lanes at a time.
static inline void NE10_CPX_MUL_NEON_S32 (CPLX &result,
        const CPLX A,
        const CPLX B)
{
    REAL ARBR = vqrdmulhq_s32 (A.val[0], B.val[0]); // Re{A} * Re{B}
    REAL ARBI = vqrdmulhq_s32 (A.val[0], B.val[1]); // Re{A} * Im{B}
    REAL AIBR = vqrdmulhq_s32 (A.val[1], B.val[0]); // Im{A} * Re{B}
    REAL AIBI = vqrdmulhq_s32 (A.val[1], B.val[1]); // Im{A} * Im{B}

    result.val[0] = ARBR - AIBI;
    result.val[1] = ARBI + AIBR;
}
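/*
 * The final two statements rely on GCC/Clang vector extensions ("-" and "+"
 * applied directly to int32x4_t) to form the standard complex product
 * (ReA*ReB - ImA*ImB) + i (ReA*ImB + ImA*ReB) without extra intrinsics.
 */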
// Loads the twiddle for element RADIX - 1, splats its real and imaginary
// parts across all lanes, multiplies it in, then recurses down the array.
template<int RADIX>
inline void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX],
        const ne10_fft_cpx_int32_t *ptr_in,
        const ne10_int32_t step)
{
    CPLX scratch_tw;
    int32x2_t d2_tmp = vld1_s32 ((ne10_int32_t *) (ptr_in + (RADIX - 2) * step));

    scratch_tw.val[0] = NE10_REAL_DUP_NEON_S32 (vget_lane_s32 (d2_tmp, 0));
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_S32 (vget_lane_s32 (d2_tmp, 1));

    NE10_CPX_MUL_NEON_S32 (scratch_in[RADIX - 1], scratch_in[RADIX - 1], scratch_tw);

    NE10_LOAD_TW_AND_MUL<RADIX - 1> (scratch_in, ptr_in, step);
}

// Base case: element 0 is never twiddled (its twiddle is always 1).
template<>
inline void NE10_LOAD_TW_AND_MUL<1> (CPLX [1],
        const ne10_fft_cpx_int32_t *,
        const ne10_int32_t)
{
}
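/*
 * The recursion unrolls entirely at compile time: for RADIX == 4 it expands
 * to three twiddle multiplies (elements 3, 2 and 1, reading twiddles at
 * offsets 2*step, 1*step and 0) plus the empty <1> base case above.
 */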
// Conjugation: negate the vector of imaginary parts.
template<>
inline void NE10_CONJ_S<CPLX> (CPLX &cplx)
{
    cplx.val[1] = -cplx.val[1];
}
// Scales scratch_out by 1/RADIX in Q31, one element per recursion step.
// The chain NE10_FFT_SCALING<RADIX, SIZE - 1> unrolls at compile time.
template<int RADIX, int SIZE = RADIX>
struct NE10_FFT_SCALING
{
    inline void operator() (CPLX scratch_out[RADIX])
    {
#ifdef NE10_DSP_CFFT_SCALING
        // 1/RADIX in Q31, rounded.
        const int32x4_t one_by_RADIX = NE10_REAL_DUP_NEON_S32 (
                (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5));

        scratch_out[SIZE - 1].val[0] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[0], one_by_RADIX);
        scratch_out[SIZE - 1].val[1] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[1], one_by_RADIX);

        NE10_FFT_SCALING<RADIX, SIZE - 1> () (scratch_out);
#endif
    }
};

// Base case: scale the last remaining element.
template<int RADIX>
struct NE10_FFT_SCALING<RADIX, 1>
{
    inline void operator() (CPLX scratch_out[RADIX])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX = NE10_REAL_DUP_NEON_S32 (
                (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5));

        scratch_out[0].val[0] = vqrdmulhq_s32 (scratch_out[0].val[0], one_by_RADIX);
        scratch_out[0].val[1] = vqrdmulhq_s32 (scratch_out[0].val[1], one_by_RADIX);
#endif
    }
};
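/*
 * With NE10_DSP_CFFT_SCALING defined, every scaled stage divides by its own
 * radix, so the stages jointly scale the transform by 1/nfft; without the
 * define, both operator() bodies compile to nothing. NE10_F2I32_MAX is the
 * Q31 representation of 1.0 (2^31 - 1).
 */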
static inline void NE10_CPX_ADD_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vaddq_s32 (a.val[0], b.val[0]);
    result.val[1] = vaddq_s32 (a.val[1], b.val[1]);
}

static inline void NE10_CPX_SUB_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vsubq_s32 (a.val[0], b.val[0]);
    result.val[1] = vsubq_s32 (a.val[1], b.val[1]);
}
// Halves each Q31 lane by arithmetic right shift.
static inline REAL NE10_HALF (REAL src)
{
    const int32x4_t CONST_HALF_NEON = { -1, -1, -1, -1 };
    src = vshlq_s32 (src, CONST_HALF_NEON);
    return src;
}
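/*
 * vshlq_s32 shifts left for positive per-lane counts and right for negative
 * ones; for a signed element type the right shift is arithmetic. A constant
 * of -1 in every lane therefore halves every lane: -6 -> -3, 7 -> 3.
 */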
// Forward declaration of the radix-RADIX butterfly kernel; only the
// specializations for RADIX = 2, 3, 4 and 5 below are defined.
template<int RADIX>
inline void NE10_FFT_FCU_NEON_S32 (CPLX scratch_out[RADIX],
        const CPLX scratch_in[RADIX]);

// Radix-2: out0 = in0 + in1, out1 = in0 - in1.
template<>
inline void NE10_FFT_FCU_NEON_S32<2> (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_S32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}
// Radix-3 butterfly. Fout doubles as working storage.
template<>
inline void NE10_FFT_FCU_NEON_S32<3> (CPLX Fout[3],
        const CPLX Fin[3])
{
    CPLX scratch[4];

    Fout[0] = Fin[0];
    Fout[1] = Fin[1];
    Fout[2] = Fin[2];

    scratch[1] = Fout[1];
    scratch[2] = Fout[2];

    NE10_CPX_ADD_NEON_S32 (scratch[3], scratch[1], scratch[2]); // in1 + in2
    NE10_CPX_SUB_NEON_S32 (scratch[0], scratch[1], scratch[2]); // in1 - in2

    Fout[1].val[0] = Fout[0].val[0] - NE10_HALF (scratch[3].val[0]);
    Fout[1].val[1] = Fout[0].val[1] - NE10_HALF (scratch[3].val[1]);

    scratch[0].val[0] = NE10_S_MUL_NEON_S32 (scratch[0].val[0], TW_3IN_S32);
    scratch[0].val[1] = NE10_S_MUL_NEON_S32 (scratch[0].val[1], TW_3IN_S32);

    Fout[0].val[0] += scratch[3].val[0];
    Fout[0].val[1] += scratch[3].val[1];

    Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1];
    Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0];

    Fout[1].val[0] -= scratch[0].val[1];
    Fout[1].val[1] += scratch[0].val[0];
}
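/*
 * Radix-3 in the usual two-constant form: with w = exp (-2*pi*i/3),
 * Re(w) = -1/2 is applied via NE10_HALF and |Im(w)| = sqrt(3)/2 via the Q31
 * constant TW_3IN_S32 (defined elsewhere in Ne10), so the whole butterfly
 * needs only one real multiply per component.
 */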
// Radix-4 butterfly: two layers of add/subtract, with the multiply by -i
// folded into the final combine.
template<>
inline void NE10_FFT_FCU_NEON_S32<4> (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_S32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_S32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_S32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}
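/*
 * The last four statements compute scratch[1] -+ i * scratch[3]: multiplying
 * by -i just swaps real/imaginary and flips one sign, so the entire radix-4
 * twiddle set {1, -i, -1, i} costs no multiplies at all.
 */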
// Radix-5 butterfly, following the kiss_fft structure.
template<>
inline void NE10_FFT_FCU_NEON_S32<5> (CPLX Fout[5],
        const CPLX Fin[5])
{
    CPLX scratch[13], scratch_in[5];

    scratch_in[0] = Fin[0];
    scratch_in[1] = Fin[1];
    scratch_in[2] = Fin[2];
    scratch_in[3] = Fin[3];
    scratch_in[4] = Fin[4];

    scratch[0] = scratch_in[0];
    scratch[1] = scratch_in[1];
    scratch[2] = scratch_in[2];
    scratch[3] = scratch_in[3];
    scratch[4] = scratch_in[4];

    NE10_CPX_ADD_NEON_S32 (scratch[7], scratch[1], scratch[4]);  // in1 + in4
    NE10_CPX_SUB_NEON_S32 (scratch[10], scratch[1], scratch[4]); // in1 - in4
    NE10_CPX_ADD_NEON_S32 (scratch[8], scratch[2], scratch[3]);  // in2 + in3
    NE10_CPX_SUB_NEON_S32 (scratch[9], scratch[2], scratch[3]);  // in2 - in3

    scratch_in[0].val[0] += scratch[7].val[0] + scratch[8].val[0];
    scratch_in[0].val[1] += scratch[7].val[1] + scratch[8].val[1];

    scratch[5].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5B_S32.r);
    scratch[5].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5B_S32.r);

    scratch[6].val[0] = NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5A_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5B_S32.i);
    scratch[6].val[1] = -NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5A_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5B_S32.i);

    NE10_CPX_SUB_NEON_S32 (scratch_in[1], scratch[5], scratch[6]);
    NE10_CPX_ADD_NEON_S32 (scratch_in[4], scratch[5], scratch[6]);

    scratch[11].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5A_S32.r);
    scratch[11].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5A_S32.r);

    scratch[12].val[0] = -NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5B_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5A_S32.i);
    scratch[12].val[1] = NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5B_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5A_S32.i);

    NE10_CPX_ADD_NEON_S32 (scratch_in[2], scratch[11], scratch[12]);
    NE10_CPX_SUB_NEON_S32 (scratch_in[3], scratch[11], scratch[12]);

    Fout[0] = scratch_in[0];
    Fout[1] = scratch_in[1];
    Fout[2] = scratch_in[2];
    Fout[3] = scratch_in[3];
    Fout[4] = scratch_in[4];
}
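/*
 * Same derivation as kiss_fft's kf_bfly5: TW_5A_S32 and TW_5B_S32 hold
 * exp (-2*pi*i/5) and exp (-4*pi*i/5) as Q31 .r/.i pairs (defined elsewhere
 * in Ne10). The conjugate symmetry of the length-5 DFT lets outputs 1/4 and
 * 2/3 share the sums and differences kept in scratch[5..12].
 */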
// One FFT stage: RADIX-point butterflies over the whole buffer. All four
// template flags are compile-time constants, so every branch below folds
// away in each instantiation.
template<ne10_int32_t RADIX, bool is_first_stage, bool is_inverse, bool is_scaled>
static __attribute__ ((noinline)) void ne10_radix_butterfly_int32_neon (
        CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft / RADIX;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            CPLX in[RADIX];
            CPLX out[RADIX];

            NE10_LOAD_BY_STEP<RADIX, CPLX> (in, Fin, in_step);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (in);
            }

            if (is_scaled)
            {
                NE10_FFT_SCALING<RADIX> () (in);
            }

            if (!is_first_stage)
            {
                NE10_LOAD_TW_AND_MUL<RADIX> (in, twiddles, out_step);
            }

            NE10_FFT_FCU_NEON_S32<RADIX> (out, in);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (out);
            }

            NE10_STORE_BY_STEP<RADIX, CPLX> (Fout, out, out_step);

            Fin++;

            if (is_first_stage)
            {
                Fout += RADIX;
            }
            else
            {
                Fout++;
                twiddles++;
            }
        }

        if (!is_first_stage)
        {
            twiddles -= out_step;
            Fout += (RADIX - 1) * out_step;
        }
    }
}
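/*
 * Each (RADIX, is_first_stage, is_inverse, is_scaled) combination becomes a
 * separate instantiation with all flag tests resolved at compile time. The
 * noinline attribute is presumably there to stop these many instantiations
 * from being inlined into the dispatch switches below and bloating them.
 */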
// Runs every stage of the mixed-radix transform, dispatching on the radix
// read from factors[] and ping-ponging between buffer and Fout.
template<bool is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_int32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_int32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // init fstride, mstride, radix
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    radix = factors[ stage_count << 1 ]; // radix of the first stage
    nfft = fstride * radix;

    // Swap the ping-pong buffers if needed so the final stage lands in Fout.
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage
    switch (radix)
    {
    case 2:
        ne10_radix_butterfly_int32_neon<2, true, is_inverse, is_scaled> (Fout, Fin,
                NULL, fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_butterfly_int32_neon<4, true, is_inverse, is_scaled> (Fout, Fin,
                NULL, fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_butterfly_int32_neon<3, true, is_inverse, is_scaled> (Fout, Fin,
                NULL, fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_butterfly_int32_neon<5, true, is_inverse, is_scaled> (Fout, Fin,
                NULL, fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (!stage_count) // single-stage transform: already done
    {
        return;
    }

    mstride *= radix;

    // update radix for the next stage
    radix = factors[ stage_count << 1 ];

    // remaining stages
    while (stage_count > 0)
    {
        // radix should be one of {2, 3, 4, 5}
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_butterfly_int32_neon<2, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_butterfly_int32_neon<3, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_butterfly_int32_neon<4, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_butterfly_int32_neon<5, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        }

        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[ stage_count << 1 ];
    }
}
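/*
 * Layout of factors[] as consumed above (inferred from the indexing, and
 * matching the ne10_factor() convention): factors[0] holds the stage count,
 * factors[1] the first-stage fstride, and each stage i contributes an
 * (mstride, radix) pair at factors[2*i - 1] and factors[2*i], read here
 * from the back (stage_count) toward the front.
 */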
// Last stage: always radix-4 here, with twiddle multiplication, processing
// NE10_FFT_PARA_LEVEL (four) butterflies per NEON iteration.
template<bool is_inverse, bool is_scaled>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);

            // 4x4 transpose so that vector k holds input k of four distinct
            // butterflies. The transpose macro is written for float vectors,
            // so reinterpret to float32 and back around it.
            {
                float32x4x2_t scratch0, scratch_in0;
                float32x4x2_t scratch1, scratch_in1;
                float32x4x2_t scratch2, scratch_in2;
                float32x4x2_t scratch3, scratch_in3;

                scratch_in0.val[0] = vreinterpretq_f32_s32 (scratch_in[0].val[0]);
                scratch_in1.val[0] = vreinterpretq_f32_s32 (scratch_in[1].val[0]);
                scratch_in2.val[0] = vreinterpretq_f32_s32 (scratch_in[2].val[0]);
                scratch_in3.val[0] = vreinterpretq_f32_s32 (scratch_in[3].val[0]);
                scratch_in0.val[1] = vreinterpretq_f32_s32 (scratch_in[0].val[1]);
                scratch_in1.val[1] = vreinterpretq_f32_s32 (scratch_in[1].val[1]);
                scratch_in2.val[1] = vreinterpretq_f32_s32 (scratch_in[2].val[1]);
                scratch_in3.val[1] = vreinterpretq_f32_s32 (scratch_in[3].val[1]);

                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);

                scratch_in[0].val[0] = vreinterpretq_s32_f32 (scratch0.val[0]);
                scratch_in[1].val[0] = vreinterpretq_s32_f32 (scratch1.val[0]);
                scratch_in[2].val[0] = vreinterpretq_s32_f32 (scratch2.val[0]);
                scratch_in[3].val[0] = vreinterpretq_s32_f32 (scratch3.val[0]);
                scratch_in[0].val[1] = vreinterpretq_s32_f32 (scratch0.val[1]);
                scratch_in[1].val[1] = vreinterpretq_s32_f32 (scratch1.val[1]);
                scratch_in[2].val[1] = vreinterpretq_s32_f32 (scratch2.val[1]);
                scratch_in[3].val[1] = vreinterpretq_s32_f32 (scratch3.val[1]);
            }

            if (is_scaled)
            {
                NE10_FFT_SCALING<4> () (scratch_in);
            }

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_in);
            }

            // The last stage always has twiddles.
            {
                CPLX scratch_tw[3];

                scratch_tw[0] = NE10_CPLX_LOAD ((CPLX *) (twiddles + 0 * out_step));
                scratch_tw[1] = NE10_CPLX_LOAD ((CPLX *) (twiddles + 1 * out_step));
                scratch_tw[2] = NE10_CPLX_LOAD ((CPLX *) (twiddles + 2 * out_step));

                NE10_CPX_MUL_NEON_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
            }

            NE10_FFT_FCU_NEON_S32<4> (scratch_out, scratch_in);

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_out);
            }

            // Store the four outputs, out_step complex elements apart.
            {
                ne10_fft_cpx_int32_t *Fout_cpx = (ne10_fft_cpx_int32_t *) Fout;

                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
            }

            Fin += 4;
            Fout++;
            twiddles += 4;
        }
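        /*
         * Why the transpose above is needed: vld2q_s32 de-interleaves re/im,
         * but it still leaves the four inputs of one butterfly in the four
         * lanes of one vector. After the 4x4 transpose, lane j of vector k
         * holds input k of butterfly j, so the radix-4 kernel computes four
         * butterflies per call. The scalar loop below mops up the
         * out_step % 4 leftover butterflies one at a time.
         */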
        // Scalar tail for the butterflies that do not fill a whole vector.
        {
            const ne10_fft_cpx_int32_t *Fin_s = (const ne10_fft_cpx_int32_t *) Fin;
            ne10_fft_cpx_int32_t *Fout_s = (ne10_fft_cpx_int32_t *) Fout;

            for (m_count = out_step % 4; m_count > 0; m_count--)
            {
                ne10_fft_cpx_int32_t scratch_in[4];
                ne10_fft_cpx_int32_t scratch_tw[3];

                scratch_in[0] = Fin_s[0];
                scratch_in[1] = Fin_s[1];
                scratch_in[2] = Fin_s[2];
                scratch_in[3] = Fin_s[3];

                // Radix-4 scaling: divide by four up front.
                if (is_scaled)
                {
                    scratch_in[0].r = scratch_in[0].r >> 2;
                    scratch_in[1].r = scratch_in[1].r >> 2;
                    scratch_in[2].r = scratch_in[2].r >> 2;
                    scratch_in[3].r = scratch_in[3].r >> 2;

                    scratch_in[0].i = scratch_in[0].i >> 2;
                    scratch_in[1].i = scratch_in[1].i >> 2;
                    scratch_in[2].i = scratch_in[2].i >> 2;
                    scratch_in[3].i = scratch_in[3].i >> 2;
                }

                if (is_inverse)
                {
                    scratch_in[0].i = -scratch_in[0].i;
                    scratch_in[1].i = -scratch_in[1].i;
                    scratch_in[2].i = -scratch_in[2].i;
                    scratch_in[3].i = -scratch_in[3].i;
                }

                scratch_tw[0] = twiddles[0 * out_step];
                scratch_tw[1] = twiddles[1 * out_step];
                scratch_tw[2] = twiddles[2 * out_step];

                NE10_CPX_MUL_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
                NE10_CPX_MUL_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
                NE10_CPX_MUL_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);

                // Scalar radix-4 butterfly; results land back in scratch_in.
                FFT_FCU<4> (scratch_in, scratch_in);

                if (is_inverse)
                {
                    scratch_in[0].i = -scratch_in[0].i;
                    scratch_in[1].i = -scratch_in[1].i;
                    scratch_in[2].i = -scratch_in[2].i;
                    scratch_in[3].i = -scratch_in[3].i;
                }

                Fout_s[0 * out_step] = scratch_in[0];
                Fout_s[1 * out_step] = scratch_in[1];
                Fout_s[2 * out_step] = scratch_in[2];
                Fout_s[3 * out_step] = scratch_in[3];

                Fin_s += 4;
                Fout_s++;
                twiddles++;
            }
        }
    }
}

#endif // NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H