#define FFT4_FS_START \
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
    ne10_int32_t tmp_r, tmp_i;

#define FFT4_FS \
    s2_r = Fin[0].r - Fin[2].r; \
    s2_i = Fin[0].i - Fin[2].i; \
    tmp_r = Fin[0].r + Fin[2].r; \
    tmp_i = Fin[0].i + Fin[2].i; \
    s0_r = Fin[1].r + Fin[3].r; \
    s0_i = Fin[1].i + Fin[3].i; \
    s1_r = Fin[1].r - Fin[3].r; \
    s1_i = Fin[1].i - Fin[3].i;

#define FFT4_FS_SCALED \
    s2_r = (Fin[0].r - Fin[2].r) >> 2; \
    s2_i = (Fin[0].i - Fin[2].i) >> 2; \
    tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
    tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
    s0_r = (Fin[1].r + Fin[3].r) >> 2; \
    s0_i = (Fin[1].i + Fin[3].i) >> 2; \
    s1_r = (Fin[1].r - Fin[3].r) >> 2; \
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

#define FFT4_FWD_LS \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r + s1_i; \
    Fout[1].i = s2_i - s1_r; \
    Fout[3].r = s2_r - s1_i; \
    Fout[3].i = s2_i + s1_r;

#define FFT4_INV_LS \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r - s1_i; \
    Fout[1].i = s2_i + s1_r; \
    Fout[3].r = s2_r + s1_i; \
    Fout[3].i = s2_i - s1_r;

/* fixed-size 4-point kernels assembled from the macros above */
static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT4_FS_START
    FFT4_FS
    FFT4_FWD_LS
}

static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT4_FS_START
    FFT4_FS
    FFT4_INV_LS
}

static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT4_FS_START
    FFT4_FS_SCALED
    FFT4_FWD_LS
}

static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT4_FS_START
    FFT4_FS_SCALED
    FFT4_INV_LS
}

#define FFT8_FS_START \
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
    const ne10_int32_t TW_81 = 1518500249; /* cos(pi/4) in Q31 */

#define FFT8_FS \
    s0_r = Fin[0].r + Fin[4].r; \
    s0_i = Fin[0].i + Fin[4].i; \
    s1_r = Fin[0].r - Fin[4].r; \
    s1_i = Fin[0].i - Fin[4].i; \
    s2_r = Fin[1].r + Fin[5].r; \
    s2_i = Fin[1].i + Fin[5].i; \
    s3_r = Fin[1].r - Fin[5].r; \
    s3_i = Fin[1].i - Fin[5].i; \
    s4_r = Fin[2].r + Fin[6].r; \
    s4_i = Fin[2].i + Fin[6].i; \
    s5_r = Fin[2].r - Fin[6].r; \
    s5_i = Fin[2].i - Fin[6].i; \
    s6_r = Fin[3].r + Fin[7].r; \
    s6_i = Fin[3].i + Fin[7].i; \
    s7_r = Fin[3].r - Fin[7].r; \
    s7_i = Fin[3].i - Fin[7].i;

#define FFT8_FS_SCALED \
    s0_r = (Fin[0].r + Fin[4].r) >> 3; \
    s0_i = (Fin[0].i + Fin[4].i) >> 3; \
    s1_r = (Fin[0].r - Fin[4].r) >> 3; \
    s1_i = (Fin[0].i - Fin[4].i) >> 3; \
    s2_r = (Fin[1].r + Fin[5].r) >> 3; \
    s2_i = (Fin[1].i + Fin[5].i) >> 3; \
    s3_r = (Fin[1].r - Fin[5].r) >> 3; \
    s3_i = (Fin[1].i - Fin[5].i) >> 3; \
    s4_r = (Fin[2].r + Fin[6].r) >> 3; \
    s4_i = (Fin[2].i + Fin[6].i) >> 3; \
    s5_r = (Fin[2].r - Fin[6].r) >> 3; \
    s5_i = (Fin[2].i - Fin[6].i) >> 3; \
    s6_r = (Fin[3].r + Fin[7].r) >> 3; \
    s6_i = (Fin[3].i + Fin[7].i) >> 3; \
    s7_r = (Fin[3].r - Fin[7].r) >> 3; \
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

#define FFT8_FWD_LS \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r + t3_i; \
    Fout[2].i = t0_i - t3_r; \
    Fout[6].r = t0_r - t3_i; \
    Fout[6].i = t0_i + t3_r; \
    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
    t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
    t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
    t0_r = s1_r - s5_i; \
    t0_i = s1_i + s5_r; \
    t1_r = s1_r + s5_i; \
    t1_i = s1_i - s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r + t3_i; \
    Fout[3].i = t0_i - t3_r; \
    Fout[7].r = t0_r - t3_i; \
    Fout[7].i = t0_i + t3_r;

#define FFT8_INV_LS \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r - t3_i; \
    Fout[2].i = t0_i + t3_r; \
    Fout[6].r = t0_r + t3_i; \
    Fout[6].i = t0_i - t3_r; \
    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
    t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
    t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
    t0_r = s1_r + s5_i; \
    t0_i = s1_i - s5_r; \
    t1_r = s1_r - s5_i; \
    t1_i = s1_i + s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r - t3_i; \
    Fout[3].i = t0_i + t3_r; \
    Fout[7].r = t0_r + t3_i; \
    Fout[7].i = t0_i - t3_r;

/* fixed-size 8-point kernels assembled from the macros above */
static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT8_FS_START
    FFT8_FS
    FFT8_FWD_LS
}

static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT8_FS_START
    FFT8_FS
    FFT8_INV_LS
}

static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT8_FS_START
    FFT8_FS_SCALED
    FFT8_FWD_LS
}

static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT8_FS_START
    FFT8_FS_SCALED
    FFT8_INV_LS
}

#define FFT16_FS_START \
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3; \
    int32_t *p_src0, *p_src4, *p_src8, *p_src12; \
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; \
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; \
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; \
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;

#define FFT16_LS_START \
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; \
    int32_t *p_tw1, *p_tw2, *p_tw3; \
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; \
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; \
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; \
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; \
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; \
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3; \
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;

#define FFT16_FS \
    p_src0 = (int32_t*) (& (Fin[0])); \
    p_src4 = (int32_t*) (& (Fin[4])); \
    p_src8 = (int32_t*) (& (Fin[8])); \
    p_src12 = (int32_t*) (& (Fin[12])); \
    q2_in_0123 = vld2q_s32 (p_src0); \
    q2_in_4567 = vld2q_s32 (p_src4); \
    q2_in_89ab = vld2q_s32 (p_src8); \
    q2_in_cdef = vld2q_s32 (p_src12); \
    q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
    q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
    q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
    q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
    q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
    q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
    q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
    q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
    q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r); \
    q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i); \
    q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r); \
    q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);

#define FFT16_FS_SCALED \
    p_src0 = (int32_t*) (& (Fin[0])); \
    p_src4 = (int32_t*) (& (Fin[4])); \
    p_src8 = (int32_t*) (& (Fin[8])); \
    p_src12 = (int32_t*) (& (Fin[12])); \
    q2_in_0123 = vld2q_s32 (p_src0); \
    q2_in_4567 = vld2q_s32 (p_src4); \
    q2_in_89ab = vld2q_s32 (p_src8); \
    q2_in_cdef = vld2q_s32 (p_src12); \
    q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
    q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
    q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
    q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
    q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
    q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
    q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
    q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
    q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r); \
    q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i); \
    q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r); \
    q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);

#define FFT16_LS_LOAD \
    tw1 = twiddles; \
    tw2 = twiddles + 4; \
    tw3 = twiddles + 8; \
    p_dst0 = (int32_t*) (&Fout[0]); \
    p_dst1 = (int32_t*) (&Fout[4]); \
    p_dst2 = (int32_t*) (&Fout[8]); \
    p_dst3 = (int32_t*) (&Fout[12]); \
    p_tw1 = (int32_t*) tw1; \
    p_tw2 = (int32_t*) tw2; \
    p_tw3 = (int32_t*) tw3; \
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d); \
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d); \
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf); \
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf); \
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0])); \
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0])); \
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0])); \
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0])); \
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1])); \
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1])); \
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1])); \
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1])); \
    q2_tw1 = vld2q_s32 (p_tw1); \
    q2_tw2 = vld2q_s32 (p_tw2); \
    q2_tw3 = vld2q_s32 (p_tw3);

#define FFT16_FWD_LS \
    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
    q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
    q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
    q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]); \
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
    q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
    q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
    q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);

#define FFT16_INV_LS \
    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
    q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
    q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
    q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]); \
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
    q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
    q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
    q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);

#define FFT16_FWD_LS_S0 \
    q_s0_r = vsubq_s32 (q_s0_r, q_tmp0); \
    q_s0_i = vaddq_s32 (q_s0_i, q_tmp1); \
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp2); \
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp3); \
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp4); \
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

#define FFT16_INV_LS_S0 \
    q_s0_r = vaddq_s32 (q_s0_r, q_tmp0); \
    q_s0_i = vsubq_s32 (q_s0_i, q_tmp1); \
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp2); \
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp3); \
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp4); \
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

#define FFT16_LS_02 \
    q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r); \
    q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i); \
    q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r); \
    q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i); \
    q_s3_r = vaddq_s32 (q_s0_r, q_s2_r); \
    q_s3_i = vaddq_s32 (q_s0_i, q_s2_i); \
    q_s4_r = vsubq_s32 (q_s0_r, q_s2_r); \
    q_s4_i = vsubq_s32 (q_s0_i, q_s2_i); \
    q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r); \
    q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i); \
    q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r); \
    q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

#define FFT16_LS_02_SCALED \
    q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r); \
    q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i); \
    q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r); \
    q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i); \
    q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r); \
    q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i); \
    q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r); \
    q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i); \
    q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r); \
    q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i); \
    q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r); \
    q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

#define FFT16_ST \
    vst2q_s32 (p_dst0, q2_out_0123); \
    vst2q_s32 (p_dst1, q2_out_4567); \
    vst2q_s32 (p_dst2, q2_out_89ab); \
    vst2q_s32 (p_dst3, q2_out_cdef);

static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS
    q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);

    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_FWD_LS
    FFT16_FWD_LS_S0
    FFT16_LS_02
    q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    FFT16_ST
}
static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS
    q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);

    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_INV_LS
    FFT16_INV_LS_S0
    FFT16_LS_02
    q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    FFT16_ST
}
static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage (halving ops fold in the scaling) */
    FFT16_FS_START
    FFT16_FS_SCALED
    q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);

    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_FWD_LS
    FFT16_FWD_LS_S0
    FFT16_LS_02_SCALED
    q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    FFT16_ST
}
static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage (halving ops fold in the scaling) */
    FFT16_FS_START
    FFT16_FS_SCALED
    q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);

    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_INV_LS
    FFT16_INV_LS_S0
    FFT16_LS_02_SCALED
    q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    FFT16_ST
}
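Two fixed-point conventions recur in all of the kernels above. TW_81 = 1518500249 is cos(pi/4) in Q31 format, i.e. the integer part of sqrt(2)/2 * 2^31, and ((int64)a * b) >> 31 is the matching Q31 multiply. The vhaddq_s32/vhsubq_s32 calls in the *_SCALED variants are halving operations, (a +- b) >> 1, which fold the per-stage scaling (the >> 2 / >> 3 of the scalar macros) into the butterflies. A small standalone check of both conventions (not library code; link with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Q31 multiply, the scalar equivalent of the ((ne10_int64_t)a * b) >> 31
   pattern used in FFT8_FWD_LS / FFT8_INV_LS above. */
static int32_t q31_mul (int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a * b) >> 31);
}

/* Scalar model of vhaddq_s32: halving add, (a + b) >> 1, no overflow. */
static int32_t vhadd_model (int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a + b) >> 1);
}

int main (void)
{
    /* TW_81 is sqrt(2)/2 in Q31: truncating 0.70710678... * 2^31 */
    int32_t tw_81 = (int32_t) (cos (M_PI / 4) * 2147483648.0);
    printf ("TW_81   = %d\n", tw_81);                        /* 1518500249 */
    printf ("q31_mul = %d\n", q31_mul (INT32_MAX, tw_81));   /* ~0.7071 in Q31 */
    printf ("vhadd   = %d\n", vhadd_model (6, 3));           /* (6+3)>>1 = 4 */
    return 0;
}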
#define RADIX8x4_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    const ne10_int32_t TW_81 = 1518500249; \
    const ne10_int32_t TW_81N = -1518500249; \
    int32_t *p_src, *p_dst; \
    int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7; \
    int32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i; \
    int32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i; \
    int32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i; \
    int32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i; \
    int32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i; \
    int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
    int32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i; \
    int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
    int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7; \
    int32x4_t q_tw_81, q_tw_81n; \
    p_src = (int32_t *) Fin; \
    p_dst = (int32_t *) Fout;

#define RADIX8x4_LOAD \
    q2_in0 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in2 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in4 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in6 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in1 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in3 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in5 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in7 = vld2q_s32 (p_src); \
    p_src += src_step;

#define RADIX8x4_STORE \
    q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
    q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
    q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
    q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
    q2_tmp4 = vtrnq_s32 (q_out4_r, q_out5_r); \
    q2_tmp5 = vtrnq_s32 (q_out4_i, q_out5_i); \
    q2_tmp6 = vtrnq_s32 (q_out6_r, q_out7_r); \
    q2_tmp7 = vtrnq_s32 (q_out6_i, q_out7_i); \
    q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
    q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
    q2_out2.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
    q2_out2.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
    q2_out4.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
    q2_out4.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
    q2_out6.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
    q2_out6.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
    q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[0]), vget_low_s32 (q2_tmp6.val[0])); \
    q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[0]), vget_low_s32 (q2_tmp7.val[0])); \
    q2_out3.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[1]), vget_low_s32 (q2_tmp6.val[1])); \
    q2_out3.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[1]), vget_low_s32 (q2_tmp7.val[1])); \
    q2_out5.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[0]), vget_high_s32 (q2_tmp6.val[0])); \
    q2_out5.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[0]), vget_high_s32 (q2_tmp7.val[0])); \
    q2_out7.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[1]), vget_high_s32 (q2_tmp6.val[1])); \
    q2_out7.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[1]), vget_high_s32 (q2_tmp7.val[1])); \
    vst2q_s32 (p_dst, q2_out0); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out1); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out2); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out3); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out4); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out5); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out6); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out7); \
    p_dst += 8; \
    p_src = p_src - src_step * 8 + 8;

#define RADIX8x4_FS_S0 \
    q_sin0_r = vaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
    q_sin0_i = vaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
    q_sin1_r = vsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
    q_sin1_i = vsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
    q_sin2_r = vaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
    q_sin2_i = vaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
    q_sin3_r = vsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
    q_sin3_i = vsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
    q_sin4_r = vaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
    q_sin4_i = vaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
    q_sin5_r = vsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
    q_sin5_i = vsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
    q_sin6_r = vaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
    q_sin6_i = vaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
    q_sin7_r = vsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
    q_sin7_i = vsubq_s32 (q2_in6.val[1], q2_in7.val[1]);

#define RADIX8x4_FWD_S357 \
    q_tw_81 = vdupq_n_s32 (TW_81); \
    q_tw_81n = vdupq_n_s32 (TW_81N); \
    q_s5_r = q_sin5_i; \
    q_s5_i = vnegq_s32 (q_sin5_r); \
    q_s3_r = vaddq_s32 (q_sin3_r, q_sin3_i); \
    q_s3_i = vsubq_s32 (q_sin3_i, q_sin3_r); \
    q_s7_r = vsubq_s32 (q_sin7_r, q_sin7_i); \
    q_s7_i = vaddq_s32 (q_sin7_i, q_sin7_r); \
    q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
    q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
    q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
    q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);

#define RADIX8x4_INV_S357 \
    q_tw_81 = vdupq_n_s32 (TW_81); \
    q_tw_81n = vdupq_n_s32 (TW_81N); \
    q_s5_r = vnegq_s32 (q_sin5_i); \
    q_s5_i = q_sin5_r; \
    q_s3_r = vsubq_s32 (q_sin3_r, q_sin3_i); \
    q_s3_i = vaddq_s32 (q_sin3_i, q_sin3_r); \
    q_s7_r = vaddq_s32 (q_sin7_r, q_sin7_i); \
    q_s7_i = vsubq_s32 (q_sin7_i, q_sin7_r); \
    q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
    q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
    q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
    q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);

#define RADIX8x4_LS_02 \
    q_s8_r = vaddq_s32 (q_sin0_r, q_sin4_r); \
    q_s8_i = vaddq_s32 (q_sin0_i, q_sin4_i); \
    q_s9_r = vaddq_s32 (q_sin1_r, q_s5_r); \
    q_s9_i = vaddq_s32 (q_sin1_i, q_s5_i); \
    q_s10_r = vsubq_s32 (q_sin0_r, q_sin4_r); \
    q_s10_i = vsubq_s32 (q_sin0_i, q_sin4_i); \
    q_s11_r = vsubq_s32 (q_sin1_r, q_s5_r); \
    q_s11_i = vsubq_s32 (q_sin1_i, q_s5_i); \
    q_s12_r = vaddq_s32 (q_sin2_r, q_sin6_r); \
    q_s12_i = vaddq_s32 (q_sin2_i, q_sin6_i); \
    q_s13_r = vaddq_s32 (q_s3_r, q_s7_r); \
    q_s13_i = vaddq_s32 (q_s3_i, q_s7_i); \
    q_s14_r = vsubq_s32 (q_sin2_r, q_sin6_r); \
    q_s14_i = vsubq_s32 (q_sin2_i, q_sin6_i); \
    q_s15_r = vsubq_s32 (q_s3_r, q_s7_r); \
    q_s15_i = vsubq_s32 (q_s3_i, q_s7_i); \
    q_out4_r = vsubq_s32 (q_s8_r, q_s12_r); \
    q_out4_i = vsubq_s32 (q_s8_i, q_s12_i); \
    q_out5_r = vsubq_s32 (q_s9_r, q_s13_r); \
    q_out5_i = vsubq_s32 (q_s9_i, q_s13_i); \
    q_out0_r = vaddq_s32 (q_s8_r, q_s12_r); \
    q_out0_i = vaddq_s32 (q_s8_i, q_s12_i); \
    q_out1_r = vaddq_s32 (q_s9_r, q_s13_r); \
    q_out1_i = vaddq_s32 (q_s9_i, q_s13_i);

#define RADIX8x4_FS_S0_SCALED \
    q_sin0_r = vhaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
    q_sin0_i = vhaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
    q_sin1_r = vhsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
    q_sin1_i = vhsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
    q_sin2_r = vhaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
    q_sin2_i = vhaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
    q_sin3_r = vhsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
    q_sin3_i = vhsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
    q_sin4_r = vhaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
    q_sin4_i = vhaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
    q_sin5_r = vhsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
    q_sin5_i = vhsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
    q_sin6_r = vhaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
    q_sin6_i = vhaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
    q_sin7_r = vhsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
    q_sin7_i = vhsubq_s32 (q2_in6.val[1], q2_in7.val[1]);

#define RADIX8x4_LS_02_SCALED \
    q_s8_r = vhaddq_s32 (q_sin0_r, q_sin4_r); \
    q_s8_i = vhaddq_s32 (q_sin0_i, q_sin4_i); \
    q_s9_r = vhaddq_s32 (q_sin1_r, q_s5_r); \
    q_s9_i = vhaddq_s32 (q_sin1_i, q_s5_i); \
    q_s10_r = vhsubq_s32 (q_sin0_r, q_sin4_r); \
    q_s10_i = vhsubq_s32 (q_sin0_i, q_sin4_i); \
    q_s11_r = vhsubq_s32 (q_sin1_r, q_s5_r); \
    q_s11_i = vhsubq_s32 (q_sin1_i, q_s5_i); \
    q_s12_r = vhaddq_s32 (q_sin2_r, q_sin6_r); \
    q_s12_i = vhaddq_s32 (q_sin2_i, q_sin6_i); \
    q_s13_r = vhaddq_s32 (q_s3_r, q_s7_r); \
    q_s13_i = vhaddq_s32 (q_s3_i, q_s7_i); \
    q_s14_r = vhsubq_s32 (q_sin2_r, q_sin6_r); \
    q_s14_i = vhsubq_s32 (q_sin2_i, q_sin6_i); \
    q_s15_r = vhsubq_s32 (q_s3_r, q_s7_r); \
    q_s15_i = vhsubq_s32 (q_s3_i, q_s7_i); \
    q_out4_r = vhsubq_s32 (q_s8_r, q_s12_r); \
    q_out4_i = vhsubq_s32 (q_s8_i, q_s12_i); \
    q_out5_r = vhsubq_s32 (q_s9_r, q_s13_r); \
    q_out5_i = vhsubq_s32 (q_s9_i, q_s13_i); \
    q_out0_r = vhaddq_s32 (q_s8_r, q_s12_r); \
    q_out0_i = vhaddq_s32 (q_s8_i, q_s12_i); \
    q_out1_r = vhaddq_s32 (q_s9_r, q_s13_r); \
    q_out1_i = vhaddq_s32 (q_s9_i, q_s13_i);

static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0
        RADIX8x4_FWD_S357
        RADIX8x4_LS_02
        q_out2_r = vaddq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vsubq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vaddq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vsubq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vsubq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vaddq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vsubq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vaddq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0
        RADIX8x4_INV_S357
        RADIX8x4_LS_02
        q_out2_r = vsubq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vaddq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vsubq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vaddq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vaddq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vsubq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vaddq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vsubq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_FWD_S357
        RADIX8x4_LS_02_SCALED
        q_out2_r = vhaddq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vhsubq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vhaddq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vhsubq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vhsubq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vhaddq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vhsubq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vhaddq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_INV_S357
        RADIX8x4_LS_02_SCALED
        q_out2_r = vhsubq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vhaddq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vhsubq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vhaddq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vhaddq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vhsubq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vhaddq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vhsubq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
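Every kernel in this file shares one load convention, so it is worth spelling out once: vld2q_s32 deinterleaves on the fly, so reading 8 consecutive int32 values r0,i0,r1,i1,r2,i2,r3,i3 from an ne10_fft_cpx_int32_t array yields one vector of real parts and one of imaginary parts. A minimal model (illustration only, assuming just arm_neon.h):

#include <arm_neon.h>
#include <stdint.h>

/* The load pattern behind RADIX8x4_LOAD and friends: p points at four
   consecutive complex values laid out as r,i,r,i,... */
static void load_4_complex (const int32_t *p, int32x4_t *re, int32x4_t *im)
{
    int32x4x2_t v = vld2q_s32 (p);
    *re = v.val[0];   /* {r0, r1, r2, r3} */
    *im = v.val[1];   /* {i0, i1, i2, i3} */
}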
#define RADIX4x4_WITHOUT_TW_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    int32_t *p_src, *p_dst; \
    int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
    int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
    int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
    int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
    p_src = (int32_t *) Fin; \
    p_dst = (int32_t *) Fout;

#define RADIX4x4_WITHOUT_TW_LOAD \
    q2_in0 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in1 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in2 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in3 = vld2q_s32 (p_src); \
    p_src += src_step;

#define RADIX4x4_WITHOUT_TW_STORE \
    q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
    q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
    q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
    q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
    q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
    q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
    q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
    q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
    q2_out2.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
    q2_out2.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
    q2_out3.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
    q2_out3.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
    vst2q_s32 (p_dst, q2_out0); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out1); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out2); \
    p_dst += 8; \
    vst2q_s32 (p_dst, q2_out3); \
    p_dst += 8; \
    p_src = p_src - src_step * 4 + 8;

#define RADIX4x4_WITHOUT_TW_S0 \
        q_s0_r = vaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s0_i = vaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s1_r = vsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s1_i = vsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s2_r = vaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s2_i = vaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_s3_r = vsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s3_i = vsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_out2_r = vsubq_s32 (q_s0_r, q_s2_r); \
        q_out2_i = vsubq_s32 (q_s0_i, q_s2_i); \
        q_out0_r = vaddq_s32 (q_s0_r, q_s2_r); \
        q_out0_i = vaddq_s32 (q_s0_i, q_s2_i);

#define RADIX4x4_WITHOUT_TW_S0_SCALED \
        q_s0_r = vhaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s0_i = vhaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s1_r = vhsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s1_i = vhsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s2_r = vhaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s2_i = vhaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_s3_r = vhsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s3_i = vhsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_out2_r = vhsubq_s32 (q_s0_r, q_s2_r); \
        q_out2_i = vhsubq_s32 (q_s0_i, q_s2_i); \
        q_out0_r = vhaddq_s32 (q_s0_r, q_s2_r); \
        q_out0_i = vhaddq_s32 (q_s0_i, q_s2_i);

static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0
        q_out1_r = vaddq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vsubq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vsubq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vaddq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0
        q_out1_r = vsubq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vaddq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vaddq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vsubq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED
        q_out1_r = vhaddq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vhsubq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vhsubq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vhaddq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START
    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED
        q_out1_r = vhsubq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vhaddq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vhaddq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vhsubq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
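The eight forward/backward loop bodies above differ only in the direction of the +-j rotation applied to the odd-difference term. A scalar sketch of the same radix-4 butterfly, with names following RADIX4x4_WITHOUT_TW_S0 (illustration only; overflow handling omitted):

#include <stdint.h>

typedef struct { int32_t r, i; } cpx32_sketch;  /* stand-in for ne10_fft_cpx_int32_t */

/* 'inverse' flips the sign of the j rotation on s3, which is the only
   difference between the *_forward_* and *_backward_* kernels above. */
static void radix4_butterfly (cpx32_sketch out[4], const cpx32_sketch in[4], int inverse)
{
    cpx32_sketch s0, s1, s2, s3;
    s0.r = in[0].r + in[2].r;  s0.i = in[0].i + in[2].i;
    s1.r = in[0].r - in[2].r;  s1.i = in[0].i - in[2].i;
    s2.r = in[1].r + in[3].r;  s2.i = in[1].i + in[3].i;
    s3.r = in[1].r - in[3].r;  s3.i = in[1].i - in[3].i;

    out[0].r = s0.r + s2.r;    out[0].i = s0.i + s2.i;
    out[2].r = s0.r - s2.r;    out[2].i = s0.i - s2.i;
    if (!inverse)   /* forward: out1 = s1 - j*s3, out3 = s1 + j*s3 */
    {
        out[1].r = s1.r + s3.i;  out[1].i = s1.i - s3.r;
        out[3].r = s1.r - s3.i;  out[3].i = s1.i + s3.r;
    }
    else            /* inverse: the rotation direction swaps */
    {
        out[1].r = s1.r - s3.i;  out[1].i = s1.i + s3.r;
        out[3].r = s1.r + s3.i;  out[3].i = s1.i - s3.r;
    }
}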
#define RADIX4x4_WITH_TW_START \
    ne10_int32_t m_count; \
    ne10_int32_t src_step = src_stride << 1; \
    ne10_int32_t dst_step = dst_stride << 1; \
    ne10_int32_t tw_step = mstride << 1; \
    int32_t *p_src, *p_dst, *p_tw; \
    int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
    int32x4x2_t q2_tw0, q2_tw1, q2_tw2; \
    int32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
    int32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i; \
    int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
    p_src = (int32_t *) Fin; \
    p_dst = (int32_t *) Fout; \
    p_tw = (int32_t *) tw;

#define RADIX4x4_WITH_TW_LOAD \
    q2_in0 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in1 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in2 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_in3 = vld2q_s32 (p_src); \
    p_src += src_step; \
    q2_tw0 = vld2q_s32 (p_tw); \
    p_tw += tw_step; \
    q2_tw1 = vld2q_s32 (p_tw); \
    p_tw += tw_step; \
    q2_tw2 = vld2q_s32 (p_tw); \
    q_s1_r = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[0]); \
    q_s1_i = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[0]); \
    q_s2_r = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[0]); \
    q_s2_i = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[0]); \
    q_s3_r = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[0]); \
    q_s3_i = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[0]); \
    q_tmp0 = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[1]); \
    q_tmp1 = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[1]); \
    q_tmp2 = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[1]); \
    q_tmp3 = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[1]); \
    q_tmp4 = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[1]); \
    q_tmp5 = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[1]);

#define RADIX4x4_WITH_TW_STORE \
    vst2q_s32 (p_dst, q2_out0); \
    p_dst += dst_step; \
    vst2q_s32 (p_dst, q2_out1); \
    p_dst += dst_step; \
    vst2q_s32 (p_dst, q2_out2); \
    p_dst += dst_step; \
    vst2q_s32 (p_dst, q2_out3); \
    p_dst += dst_step; \
    p_src = p_src - src_step * 4 + 8; \
    p_dst = p_dst - dst_step * 4 + 8; \
    p_tw = p_tw - tw_step * 2 + 8;

#define RADIX4x4_WITH_TW_S1_FWD \
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp0); \
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp1); \
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp2); \
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp3); \
    q_s3_r = vsubq_s32 (q_s3_r, q_tmp4); \
    q_s3_i = vaddq_s32 (q_s3_i, q_tmp5);

#define RADIX4x4_WITH_TW_S1_INV \
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp0); \
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp1); \
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp2); \
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp3); \
    q_s3_r = vaddq_s32 (q_s3_r, q_tmp4); \
    q_s3_i = vsubq_s32 (q_s3_i, q_tmp5);

#define RADIX4x4_WITH_TW_LS_02 \
    q_s4_r = vaddq_s32 (q2_in0.val[0], q_s2_r); \
    q_s4_i = vaddq_s32 (q2_in0.val[1], q_s2_i); \
    q_s5_r = vsubq_s32 (q2_in0.val[0], q_s2_r); \
    q_s5_i = vsubq_s32 (q2_in0.val[1], q_s2_i); \
    q_s6_r = vaddq_s32 (q_s1_r, q_s3_r); \
    q_s6_i = vaddq_s32 (q_s1_i, q_s3_i); \
    q_s7_r = vsubq_s32 (q_s1_r, q_s3_r); \
    q_s7_i = vsubq_s32 (q_s1_i, q_s3_i); \
    q2_out2.val[0] = vsubq_s32 (q_s4_r, q_s6_r); \
    q2_out2.val[1] = vsubq_s32 (q_s4_i, q_s6_i); \
    q2_out0.val[0] = vaddq_s32 (q_s4_r, q_s6_r); \
    q2_out0.val[1] = vaddq_s32 (q_s4_i, q_s6_i);

#define RADIX4x4_WITH_TW_LS_02_SCALED \
    q_s4_r = vhaddq_s32 (q2_in0.val[0], q_s2_r); \
    q_s4_i = vhaddq_s32 (q2_in0.val[1], q_s2_i); \
    q_s5_r = vhsubq_s32 (q2_in0.val[0], q_s2_r); \
    q_s5_i = vhsubq_s32 (q2_in0.val[1], q_s2_i); \
    q_s6_r = vhaddq_s32 (q_s1_r, q_s3_r); \
    q_s6_i = vhaddq_s32 (q_s1_i, q_s3_i); \
    q_s7_r = vhsubq_s32 (q_s1_r, q_s3_r); \
    q_s7_i = vhsubq_s32 (q_s1_i, q_s3_i); \
    q2_out2.val[0] = vhsubq_s32 (q_s4_r, q_s6_r); \
    q2_out2.val[1] = vhsubq_s32 (q_s4_i, q_s6_i); \
    q2_out0.val[0] = vhaddq_s32 (q_s4_r, q_s6_r); \
    q2_out0.val[1] = vhaddq_s32 (q_s4_i, q_s6_i);

static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02
        q2_out1.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02
        q2_out1.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02_SCALED
        q2_out1.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START
    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02_SCALED
        q2_out1.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
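RADIX4x4_WITH_TW_LOAD computes the six partial products of the three twiddle multiplies with vqdmulhq_s32, and the ..._S1_FWD / ..._S1_INV combiners turn them into w*x and conj(w)*x respectively. vqdmulhq_s32(a, b) computes (2*a*b) >> 32 per lane (saturating only for a == b == INT32_MIN), i.e. a Q31 product. A scalar sketch (illustration only):

#include <stdint.h>

/* Scalar model of vqdmulhq_s32's Q31 product */
static int32_t qdmulh_model (int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a * b) >> 31);
}

typedef struct { int32_t r, i; } cpx32_sketch;

/* RADIX4x4_WITH_TW_LOAD + ..._S1_FWD: y = x * w */
static cpx32_sketch tw_mul_fwd (cpx32_sketch x, cpx32_sketch w)
{
    cpx32_sketch y;
    y.r = qdmulh_model (x.r, w.r) - qdmulh_model (x.i, w.i);
    y.i = qdmulh_model (x.i, w.r) + qdmulh_model (x.r, w.i);
    return y;
}

/* RADIX4x4_WITH_TW_LOAD + ..._S1_INV: y = x * conj(w); only two signs flip */
static cpx32_sketch tw_mul_inv (cpx32_sketch x, cpx32_sketch w)
{
    cpx32_sketch y;
    y.r = qdmulh_model (x.r, w.r) + qdmulh_model (x.i, w.i);
    y.i = qdmulh_model (x.i, w.r) - qdmulh_model (x.r, w.i);
    return y;
}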
#define ne10_mixed_radix_fft_forward_int32_neon(scaled) \
void ne10_mixed_radix_fft_forward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
        ne10_fft_cpx_int32_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int32_t * twiddles, \
        ne10_fft_cpx_int32_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
\
    ne10_fft_cpx_int32_t *Fin1, *Fout1; \
    ne10_fft_cpx_int32_t *Fout_ls = Fout; \
    ne10_fft_cpx_int32_t *Ftmp; \
    ne10_fft_cpx_int32_t *tw, *tw1; \
\
    /* init fstride, mstride, N from the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; /* first-stage radix */ \
\
    /* first stage: either a radix-8 or a radix-4 butterfly, selected by N */ \
    /* ... */ \
    ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
\
    /* other stages */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        /* ... */ \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            /* ... */ \
            ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        } \
        tw += mstride * 3; \
        /* ... */ \
    } \
\
    /* last stage */ \
    /* ... */ \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        /* ... */ \
        ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
        /* ... */ \
    } \
    /* ... */ \
}

#define ne10_mixed_radix_fft_backward_int32_neon(scaled) \
void ne10_mixed_radix_fft_backward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
        ne10_fft_cpx_int32_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int32_t * twiddles, \
        ne10_fft_cpx_int32_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
\
    ne10_fft_cpx_int32_t *Fin1, *Fout1; \
    ne10_fft_cpx_int32_t *Fout_ls = Fout; \
    ne10_fft_cpx_int32_t *Ftmp; \
    ne10_fft_cpx_int32_t *tw, *tw1; \
\
    /* init fstride, mstride, N from the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; /* first-stage radix */ \
\
    /* first stage: either a radix-8 or a radix-4 butterfly, selected by N */ \
    /* ... */ \
    ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
\
    /* other stages */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        /* ... */ \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            /* ... */ \
            ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        } \
        tw += mstride * 3; \
        /* ... */ \
    } \
\
    /* last stage */ \
    /* ... */ \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        /* ... */ \
        ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
        /* ... */ \
    } \
    /* ... */ \
}

/* The four mixed-radix kernels are generated by instantiating the macros;
   the names match the asm-labelled declarations at the end of this file. */
ne10_mixed_radix_fft_forward_int32_neon (unscaled)
ne10_mixed_radix_fft_forward_int32_neon (scaled)
ne10_mixed_radix_fft_backward_int32_neon (unscaled)
ne10_mixed_radix_fft_backward_int32_neon (scaled)

static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
        const ne10_fft_cpx_int32_t *src,
        ne10_fft_cpx_int32_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
    int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fpnk_r, q_fpnk_i;
    int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int32x4_t q_tw_r, q_tw_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;
    if (scaled_flag)
        NE10_F2I32_FIXDIV (tdc, 2);

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
    if (scaled_flag)
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst  = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk  = vld2q_s32 (p_src);
            q2_fpnk = vld2q_s32 (p_src2);
            q2_tw   = vld2q_s32 (p_twiddles);
            q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_s32 (q_fpnk_i);

            q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

            q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
            q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
            q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
            q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst  = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk  = vld2q_s32 (p_src);
            q2_fpnk = vld2q_s32 (p_src2);
            q2_tw   = vld2q_s32 (p_twiddles);
            q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_s32 (q_fpnk_i);

            q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

            q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
            q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
            q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
            q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    /* ... */
    /* scalar path */
    for (k = 1; k <= ncfft / 2 ; ++k)
    {
        fpk = src[k];
        fpnk.r =   src[ncfft - k].r;
        fpnk.i = - src[ncfft - k].i;
        if (scaled_flag)
        {
            NE10_F2I32_FIXDIV (fpk, 2);
            NE10_F2I32_FIXDIV (fpnk, 2);
        }

        f1k.r = fpk.r + fpnk.r;
        f1k.i = fpk.i + fpnk.i;

        f2k.r = fpk.r - fpnk.r;
        f2k.i = fpk.i - fpnk.i;

        /* tw = f2k * twiddles[k - 1] (Q31 complex multiply) */
        tw.r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * twiddles[k - 1].r
                                - (NE10_F2I32_SAMPPROD) f2k.i * twiddles[k - 1].i) >> 31);
        tw.i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * twiddles[k - 1].i
                                + (NE10_F2I32_SAMPPROD) f2k.i * twiddles[k - 1].r) >> 31);

        dst[k].r = (f1k.r + tw.r) >> 1;
        dst[k].i = (f1k.i + tw.i) >> 1;
        dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
        dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
    }
}
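Both split loops reverse the order of the four mirrored bins after loading them: vrev64q_s32 swaps lanes within each 64-bit half ([0 1 2 3] -> [1 0 3 2]), and the vcombine/vget pair then swaps the halves, giving the full lane reversal [3 2 1 0]. The same idiom as a standalone helper (illustration only):

#include <arm_neon.h>

static inline int32x4_t reverse_lanes_s32 (int32x4_t v)
{
    v = vrev64q_s32 (v);                     /* [1 0 3 2] */
    return vcombine_s32 (vget_high_s32 (v),  /* [3 2 ...] */
                         vget_low_s32 (v));  /* [... 1 0] */
}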
static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
        const ne10_fft_cpx_int32_t *src,
        ne10_fft_cpx_int32_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
    int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fnkc_r, q_fnkc_i;
    int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;
    if (scaled_flag)
        NE10_F2I32_FIXDIV (dst[0], 2);
    if (scaled_flag)
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst  = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fk   = vld2q_s32 (p_src);
            q2_fnkc = vld2q_s32 (p_src2);
            q2_tw   = vld2q_s32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_s32 (q_fnkc_i);

            q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
            q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

            q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst  = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fk   = vld2q_s32 (p_src);
            q2_fnkc = vld2q_s32 (p_src2);
            q2_tw   = vld2q_s32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_s32 (q_fnkc_i);

            q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
            q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

            q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    /* ... */
    /* scalar path */
    for (k = 1; k <= ncfft / 2; k++)
    {
        fk = src[k];
        fnkc.r = src[ncfft - k].r;
        fnkc.i = -src[ncfft - k].i;
        if (scaled_flag)
        {
            NE10_F2I32_FIXDIV (fk, 2);
            NE10_F2I32_FIXDIV (fnkc, 2);
        }

        fek.r = fk.r + fnkc.r;
        fek.i = fk.i + fnkc.i;

        tmp.r = fk.r - fnkc.r;
        tmp.i = fk.i - fnkc.i;

        /* fok = tmp * conj(twiddles[k - 1]) (Q31 complex multiply) */
        fok.r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * twiddles[k - 1].r
                                 + (NE10_F2I32_SAMPPROD) tmp.i * twiddles[k - 1].i) >> 31);
        fok.i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * twiddles[k - 1].r
                                 - (NE10_F2I32_SAMPPROD) tmp.r * twiddles[k - 1].i) >> 31);

        dst[k].r = fek.r + fok.r;
        dst[k].i = fek.i + fok.i;

        dst[ncfft - k].r = fek.r - fok.r;
        dst[ncfft - k].i = fok.i - fek.i;
    }
}
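A usage sketch for the two real-transform wrappers defined below, assuming the standard Ne10 config helpers ne10_fft_alloc_r2c_int32 / ne10_fft_destroy_r2c_int32 and the umbrella header name (both are assumptions here, not shown in this file):

#include "NE10.h"   /* assumed umbrella header; adjust to your Ne10 layout */

/* Real forward transform, then inverse back to the time domain, both with
   scaled_flag = 1 so the fixed-point data is shifted down inside the stages
   to avoid overflow. */
void example_r2c_roundtrip (void)
{
    ne10_int32_t time[1024];
    ne10_fft_cpx_int32_t freq[1024 / 2 + 1];          /* r2c yields N/2+1 bins */
    ne10_fft_r2c_cfg_int32_t cfg = ne10_fft_alloc_r2c_int32 (1024);

    for (int i = 0; i < 1024; i++)
        time[i] = (i % 64) << 20;                     /* small Q31 test ramp */

    ne10_fft_r2c_1d_int32_neon (freq, time, cfg, 1);
    ne10_fft_c2r_1d_int32_neon (time, freq, cfg, 1);

    ne10_fft_destroy_r2c_int32 (cfg);
}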
/* ne10_fft_c2c_1d_int32_neon: fixed-size kernels handle small N, the
   mixed-radix pipeline handles everything larger */
/* ... */
            /* inverse FFT, scaled */
            case 4:
                ne10_fft4_backward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            /* ... */

            /* forward FFT, scaled */
            case 4:
                ne10_fft4_forward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            /* ... */

            /* inverse FFT, unscaled */
            case 4:
                ne10_fft4_backward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            /* ... */

            /* forward FFT, unscaled */
            case 4:
                ne10_fft4_forward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            /* ... */
/* ne10_fft_r2c_1d_int32_neon: forward c2c on the packed input, then split */
/* ... */
    c2c_state.buffer = tmpbuf2;
    /* ... */
    ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
/* ... */

/* ne10_fft_c2r_1d_int32_neon: merge the half spectrum, then inverse c2c */
/* ... */
    c2c_state.buffer = tmpbuf2;
    /* ... */
    ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
/* ... */
File reference summary (macros and functions defined or declared in this file):

Macros:
    NE10_FFT_ALG_DEFAULT
    NE10_F2I32_SAMPPROD
    NE10_F2I32_FIXDIV(c, div)
    FFT4_* / FFT8_* / FFT16_* butterfly helpers (defined above)
    RADIX8x4_* and RADIX4x4_* stage helpers (defined above)
    ne10_mixed_radix_fft_forward_int32_neon(scaled)
    ne10_mixed_radix_fft_backward_int32_neon(scaled)

Functions:
    void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin,
            ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
        Specific implementation of ne10_fft_c2c_1d_int32 using NEON SIMD capabilities.

    void ne10_fft_c2c_1d_int32_c (ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin,
            ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
        Specific implementation of ne10_fft_c2c_1d_int32 using plain C.

    void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout, ne10_int32_t *fin,
            ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
        Specific implementation of ne10_fft_r2c_1d_int32 using NEON SIMD capabilities.

    void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout, ne10_fft_cpx_int32_t *fin,
            ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
        Specific implementation of ne10_fft_c2r_1d_int32 using NEON SIMD capabilities.

    void ne10_mixed_radix_fft_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t *Fout,
            ne10_fft_cpx_int32_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int32_t *twiddles,
            ne10_fft_cpx_int32_t *buffer) asm ("ne10_mixed_radix_fft_forward_int32_unscaled_neon")
    void ne10_mixed_radix_fft_forward_int32_scaled_neon (...) asm ("ne10_mixed_radix_fft_forward_int32_scaled_neon")
    void ne10_mixed_radix_fft_backward_int32_unscaled_neon (...) asm ("ne10_mixed_radix_fft_backward_int32_unscaled_neon")
    void ne10_mixed_radix_fft_backward_int32_scaled_neon (...) asm ("ne10_mixed_radix_fft_backward_int32_scaled_neon")
        The four kernels generated by the two mixed-radix macros, declared with explicit asm labels.

    void ne10_mixed_radix_generic_butterfly_int32_neon (ne10_fft_cpx_int32_t *Fout,
            const ne10_fft_cpx_int32_t *Fin, const ne10_int32_t *factors,
            const ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer,
            const ne10_int32_t scaled_flag)
    void ne10_mixed_radix_generic_butterfly_inverse_int32_neon (same parameters)
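A usage sketch for the complex entry point, assuming the standard Ne10 config helpers ne10_fft_alloc_c2c_int32 / ne10_fft_destroy_c2c_int32 and the umbrella header name (assumptions, not shown in this file):

#include "NE10.h"   /* assumed umbrella header; adjust to your Ne10 layout */

/* Forward then inverse 64-point complex transform of a small impulse. */
int example_c2c (void)
{
    ne10_fft_cpx_int32_t in[64], out[64];
    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32 (64);
    if (cfg == NULL)
        return -1;

    for (int i = 0; i < 64; i++)
    {
        in[i].r = (i == 1) ? (1 << 20) : 0;  /* small Q31 tone on bin 1 */
        in[i].i = 0;
    }

    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0 /* forward */, 1 /* scaled */);
    ne10_fft_c2c_1d_int32_neon (in, out, cfg, 1 /* inverse */, 1 /* scaled */);

    ne10_fft_destroy_c2c_int32 (cfg);
    return 0;
}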