46     s2_r = Fin[0].
r - Fin[2].
r;
    47     s2_i = Fin[0].
i - Fin[2].
i;
    49     tmp_r = Fin[0].
r + Fin[2].
r;
    50     tmp_i = Fin[0].
i + Fin[2].
i;
    52     s0_r = Fin[1].
r + Fin[3].
r;
    53     s0_i = Fin[1].
i + Fin[3].
i;
    55     s1_r = Fin[1].
r - Fin[3].
r;
    56     s1_i = Fin[1].
i - Fin[3].
i;
    57     Fout[2].
r = tmp_r - s0_r;
    58     Fout[2].
i = tmp_i - s0_i;
    59     Fout[0].
r = tmp_r + s0_r;
    60     Fout[0].
i = tmp_i + s0_i;
    62     Fout[1].
r = s2_r + s1_i;
    63     Fout[1].
i = s2_i - s1_r;
    64     Fout[3].
r = s2_r - s1_i;
    65     Fout[3].
i = s2_i + s1_r;
    75     s2_r = Fin[0].
r - Fin[2].
r;
    76     s2_i = Fin[0].
i - Fin[2].
i;
    78     tmp_r = Fin[0].
r + Fin[2].
r;
    79     tmp_i = Fin[0].
i + Fin[2].
i;
    81     s0_r = Fin[1].
r + Fin[3].
r;
    82     s0_i = Fin[1].
i + Fin[3].
i;
    84     s1_r = Fin[1].
r - Fin[3].
r;
    85     s1_i = Fin[1].
i - Fin[3].
i;
    87     Fout[2].
r = tmp_r - s0_r;
    88     Fout[2].
i = tmp_i - s0_i;
    89     Fout[0].
r = tmp_r + s0_r;
    90     Fout[0].
i = tmp_i + s0_i;
    92     Fout[1].
r = s2_r - s1_i;
    93     Fout[1].
i = s2_i + s1_r;
    94     Fout[3].
r = s2_r + s1_i;
    95     Fout[3].
i = s2_i - s1_r;
   104     s2_r = (Fin[0].
r - Fin[2].
r) >> 2;
   105     s2_i = (Fin[0].
i - Fin[2].
i) >> 2;
   106     tmp_r = (Fin[0].
r + Fin[2].
r) >> 2;
   107     tmp_i = (Fin[0].
i + Fin[2].
i) >> 2;
   109     s0_r = (Fin[1].
r + Fin[3].
r) >> 2;
   110     s0_i = (Fin[1].
i + Fin[3].
i) >> 2;
   111     s1_r = (Fin[1].
r - Fin[3].
r) >> 2;
   112     s1_i = (Fin[1].
i - Fin[3].
i) >> 2;
   114     Fout[2].
r = tmp_r - s0_r;
   115     Fout[2].
i = tmp_i - s0_i;
   116     Fout[0].
r = tmp_r + s0_r;
   117     Fout[0].
i = tmp_i + s0_i;
   119     Fout[1].
r = s2_r + s1_i;
   120     Fout[1].
i = s2_i - s1_r;
   121     Fout[3].
r = s2_r - s1_i;
   122     Fout[3].
i = s2_i + s1_r;
   132     s2_r = (Fin[0].
r - Fin[2].
r) >> 2;
   133     s2_i = (Fin[0].
i - Fin[2].
i) >> 2;
   134     tmp_r = (Fin[0].
r + Fin[2].
r) >> 2;
   135     tmp_i = (Fin[0].
i + Fin[2].
i) >> 2;
   137     s0_r = (Fin[1].
r + Fin[3].
r) >> 2;
   138     s0_i = (Fin[1].
i + Fin[3].
i) >> 2;
   139     s1_r = (Fin[1].
r - Fin[3].
r) >> 2;
   140     s1_i = (Fin[1].
i - Fin[3].
i) >> 2;
   142     Fout[2].
r = tmp_r - s0_r;
   143     Fout[2].
i = tmp_i - s0_i;
   144     Fout[0].
r = tmp_r + s0_r;
   145     Fout[0].
i = tmp_i + s0_i;
   147     Fout[1].
r = s2_r - s1_i;
   148     Fout[1].
i = s2_i + s1_r;
   149     Fout[3].
r = s2_r + s1_i;
   150     Fout[3].
i = s2_i - s1_r;
   156     ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
   157     ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
   160     s0_r = Fin[0].
r + Fin[4].
r;
   161     s0_i = Fin[0].
i + Fin[4].
i;
   162     s1_r = Fin[0].
r - Fin[4].
r;
   163     s1_i = Fin[0].
i - Fin[4].
i;
   164     s2_r = Fin[1].
r + Fin[5].
r;
   165     s2_i = Fin[1].
i + Fin[5].
i;
   166     s3_r = Fin[1].
r - Fin[5].
r;
   167     s3_i = Fin[1].
i - Fin[5].
i;
   168     s4_r = Fin[2].
r + Fin[6].
r;
   169     s4_i = Fin[2].
i + Fin[6].
i;
   170     s5_r = Fin[2].
r - Fin[6].
r;
   171     s5_i = Fin[2].
i - Fin[6].
i;
   172     s6_r = Fin[3].
r + Fin[7].
r;
   173     s6_i = Fin[3].
i + Fin[7].
i;
   174     s7_r = Fin[3].
r - Fin[7].
r;
   175     s7_i = Fin[3].
i - Fin[7].
i;
   185     Fout[0].
r = t1_r + t2_r;
   186     Fout[0].
i = t1_i + t2_i;
   187     Fout[4].
r = t1_r - t2_r;
   188     Fout[4].
i = t1_i - t2_i;
   189     Fout[2].
r = t0_r + t3_i;
   190     Fout[2].
i = t0_i - t3_r;
   191     Fout[6].
r = t0_r - t3_i;
   192     Fout[6].
i = t0_i + t3_r;
   207     Fout[1].
r = t1_r + t2_r;
   208     Fout[1].
i = t1_i + t2_i;
   209     Fout[5].
r = t1_r - t2_r;
   210     Fout[5].
i = t1_i - t2_i;
   211     Fout[3].
r = t0_r + t3_i;
   212     Fout[3].
i = t0_i - t3_r;
   213     Fout[7].
r = t0_r - t3_i;
   214     Fout[7].
i = t0_i + t3_r;
   221     ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
   222     ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
   225     s0_r = Fin[0].
r + Fin[4].
r;
   226     s0_i = Fin[0].
i + Fin[4].
i;
   227     s1_r = Fin[0].
r - Fin[4].
r;
   228     s1_i = Fin[0].
i - Fin[4].
i;
   229     s2_r = Fin[1].
r + Fin[5].
r;
   230     s2_i = Fin[1].
i + Fin[5].
i;
   231     s3_r = Fin[1].
r - Fin[5].
r;
   232     s3_i = Fin[1].
i - Fin[5].
i;
   233     s4_r = Fin[2].
r + Fin[6].
r;
   234     s4_i = Fin[2].
i + Fin[6].
i;
   235     s5_r = Fin[2].
r - Fin[6].
r;
   236     s5_i = Fin[2].
i - Fin[6].
i;
   237     s6_r = Fin[3].
r + Fin[7].
r;
   238     s6_i = Fin[3].
i + Fin[7].
i;
   239     s7_r = Fin[3].
r - Fin[7].
r;
   240     s7_i = Fin[3].
i - Fin[7].
i;
   250     Fout[0].
r = t1_r + t2_r;
   251     Fout[0].
i = t1_i + t2_i;
   252     Fout[4].
r = t1_r - t2_r;
   253     Fout[4].
i = t1_i - t2_i;
   254     Fout[2].
r = t0_r - t3_i;
   255     Fout[2].
i = t0_i + t3_r;
   256     Fout[6].
r = t0_r + t3_i;
   257     Fout[6].
i = t0_i - t3_r;
   272     Fout[1].
r = t1_r + t2_r;
   273     Fout[1].
i = t1_i + t2_i;
   274     Fout[5].
r = t1_r - t2_r;
   275     Fout[5].
i = t1_i - t2_i;
   276     Fout[3].
r = t0_r - t3_i;
   277     Fout[3].
i = t0_i + t3_r;
   278     Fout[7].
r = t0_r + t3_i;
   279     Fout[7].
i = t0_i - t3_r;
   285     ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
   286     ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
   289     s0_r = (Fin[0].
r + Fin[4].
r) >> 3;
   290     s0_i = (Fin[0].
i + Fin[4].
i) >> 3;
   291     s1_r = (Fin[0].
r - Fin[4].
r) >> 3;
   292     s1_i = (Fin[0].
i - Fin[4].
i) >> 3;
   293     s2_r = (Fin[1].
r + Fin[5].
r) >> 3;
   294     s2_i = (Fin[1].
i + Fin[5].
i) >> 3;
   295     s3_r = (Fin[1].
r - Fin[5].
r) >> 3;
   296     s3_i = (Fin[1].
i - Fin[5].
i) >> 3;
   297     s4_r = (Fin[2].
r + Fin[6].
r) >> 3;
   298     s4_i = (Fin[2].
i + Fin[6].
i) >> 3;
   299     s5_r = (Fin[2].
r - Fin[6].
r) >> 3;
   300     s5_i = (Fin[2].
i - Fin[6].
i) >> 3;
   301     s6_r = (Fin[3].
r + Fin[7].
r) >> 3;
   302     s6_i = (Fin[3].
i + Fin[7].
i) >> 3;
   303     s7_r = (Fin[3].
r - Fin[7].
r) >> 3;
   304     s7_i = (Fin[3].
i - Fin[7].
i) >> 3;
   314     Fout[0].
r = t1_r + t2_r;
   315     Fout[0].
i = t1_i + t2_i;
   316     Fout[4].
r = t1_r - t2_r;
   317     Fout[4].
i = t1_i - t2_i;
   318     Fout[2].
r = t0_r + t3_i;
   319     Fout[2].
i = t0_i - t3_r;
   320     Fout[6].
r = t0_r - t3_i;
   321     Fout[6].
i = t0_i + t3_r;
   336     Fout[1].
r = t1_r + t2_r;
   337     Fout[1].
i = t1_i + t2_i;
   338     Fout[5].
r = t1_r - t2_r;
   339     Fout[5].
i = t1_i - t2_i;
   340     Fout[3].
r = t0_r + t3_i;
   341     Fout[3].
i = t0_i - t3_r;
   342     Fout[7].
r = t0_r - t3_i;
   343     Fout[7].
i = t0_i + t3_r;
   350     ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
   351     ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
   354     s0_r = (Fin[0].
r + Fin[4].
r) >> 3;
   355     s0_i = (Fin[0].
i + Fin[4].
i) >> 3;
   356     s1_r = (Fin[0].
r - Fin[4].
r) >> 3;
   357     s1_i = (Fin[0].
i - Fin[4].
i) >> 3;
   358     s2_r = (Fin[1].
r + Fin[5].
r) >> 3;
   359     s2_i = (Fin[1].
i + Fin[5].
i) >> 3;
   360     s3_r = (Fin[1].
r - Fin[5].
r) >> 3;
   361     s3_i = (Fin[1].
i - Fin[5].
i) >> 3;
   362     s4_r = (Fin[2].
r + Fin[6].
r) >> 3;
   363     s4_i = (Fin[2].
i + Fin[6].
i) >> 3;
   364     s5_r = (Fin[2].
r - Fin[6].
r) >> 3;
   365     s5_i = (Fin[2].
i - Fin[6].
i) >> 3;
   366     s6_r = (Fin[3].
r + Fin[7].
r) >> 3;
   367     s6_i = (Fin[3].
i + Fin[7].
i) >> 3;
   368     s7_r = (Fin[3].
r - Fin[7].
r) >> 3;
   369     s7_i = (Fin[3].
i - Fin[7].
i) >> 3;
   379     Fout[0].
r = t1_r + t2_r;
   380     Fout[0].
i = t1_i + t2_i;
   381     Fout[4].
r = t1_r - t2_r;
   382     Fout[4].
i = t1_i - t2_i;
   383     Fout[2].
r = t0_r - t3_i;
   384     Fout[2].
i = t0_i + t3_r;
   385     Fout[6].
r = t0_r + t3_i;
   386     Fout[6].
i = t0_i - t3_r;
   401     Fout[1].
r = t1_r + t2_r;
   402     Fout[1].
i = t1_i + t2_i;
   403     Fout[5].
r = t1_r - t2_r;
   404     Fout[5].
i = t1_i - t2_i;
   405     Fout[3].
r = t0_r - t3_i;
   406     Fout[3].
i = t0_i + t3_r;
   407     Fout[7].
r = t0_r + t3_i;
   408     Fout[7].
i = t0_i - t3_r;
   418     int32_t *p_src0, *p_src4, *p_src8, *p_src12;
   419     int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
   420     int32x4_t q_t0_r,  q_t0_i, q_t1_r,  q_t1_i, q_t2_r,  q_t2_i, q_t3_r, q_t3_i;
   421     int32x4_t q_out_r048c,  q_out_i048c, q_out_r159d,  q_out_i159d;
   422     int32x4_t q_out_r26ae,  q_out_i26ae, q_out_r37bf,  q_out_i37bf;
   423     p_src0 = (int32_t*) (& (Fin[0]));
   424     p_src4 = (int32_t*) (& (Fin[4]));
   425     p_src8 = (int32_t*) (& (Fin[8]));
   426     p_src12 = (int32_t*) (& (Fin[12]));
   427     q2_in_0123 = vld2q_s32 (p_src0);
   428     q2_in_4567 = vld2q_s32 (p_src4);
   429     q2_in_89ab = vld2q_s32 (p_src8);
   430     q2_in_cdef = vld2q_s32 (p_src12);
   432     q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   433     q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   434     q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   435     q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   437     q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   438     q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   439     q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   440     q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   442     q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
   443     q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
   444     q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
   445     q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
   446     q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
   447     q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
   448     q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
   449     q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);
   452     int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
   453     int32_t *p_tw1, *p_tw2, *p_tw3;
   454     int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
   455     int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
   456     int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
   457     int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
   458     int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
   459     int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
   460     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
   461     int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
   465     p_dst0 = (int32_t*) (&Fout[0]);
   466     p_dst1 = (int32_t*) (&Fout[4]);
   467     p_dst2 = (int32_t*) (&Fout[8]);
   468     p_dst3 = (int32_t*) (&Fout[12]);
   469     p_tw1 = (int32_t*) tw1;
   470     p_tw2 = (int32_t*) tw2;
   471     p_tw3 = (int32_t*) tw3;
   472     q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
   473     q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
   474     q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
   475     q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
   476     q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
   477     q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
   478     q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
   479     q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
   480     q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
   481     q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
   482     q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
   483     q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
   484     q2_tw1 = vld2q_s32 (p_tw1);
   485     q2_tw2 = vld2q_s32 (p_tw2);
   486     q2_tw3 = vld2q_s32 (p_tw3);
   488     q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
   489     q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
   490     q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
   491     q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
   492     q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
   493     q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
   494     q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
   495     q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
   496     q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
   497     q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
   498     q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
   499     q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
   500     q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
   501     q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
   502     q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
   503     q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
   504     q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
   505     q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);
   507     q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
   508     q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
   509     q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
   510     q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);
   512     q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
   513     q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
   514     q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
   515     q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);
   517     q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
   518     q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
   519     q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
   520     q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);
   522     q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
   523     q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
   524     q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
   525     q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
   527     vst2q_s32 (p_dst0, q2_out_0123);
   528     vst2q_s32 (p_dst1, q2_out_4567);
   529     vst2q_s32 (p_dst2, q2_out_89ab);
   530     vst2q_s32 (p_dst3, q2_out_cdef);
   540     int32_t *p_src0, *p_src4, *p_src8, *p_src12;
   541     int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
   542     int32x4_t q_t0_r,  q_t0_i, q_t1_r,  q_t1_i, q_t2_r,  q_t2_i, q_t3_r, q_t3_i;
   543     int32x4_t q_out_r048c,  q_out_i048c, q_out_r159d,  q_out_i159d;
   544     int32x4_t q_out_r26ae,  q_out_i26ae, q_out_r37bf,  q_out_i37bf;
   545     p_src0 = (int32_t*) (& (Fin[0]));
   546     p_src4 = (int32_t*) (& (Fin[4]));
   547     p_src8 = (int32_t*) (& (Fin[8]));
   548     p_src12 = (int32_t*) (& (Fin[12]));
   549     q2_in_0123 = vld2q_s32 (p_src0);
   550     q2_in_4567 = vld2q_s32 (p_src4);
   551     q2_in_89ab = vld2q_s32 (p_src8);
   552     q2_in_cdef = vld2q_s32 (p_src12);
   554     q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   555     q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   556     q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   557     q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   559     q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   560     q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   561     q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   562     q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   564     q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
   565     q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
   566     q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
   567     q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
   568     q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
   569     q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
   570     q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
   571     q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);
   574     int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
   575     int32_t *p_tw1, *p_tw2, *p_tw3;
   576     int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
   577     int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
   578     int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
   579     int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
   580     int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
   581     int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
   582     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
   583     int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
   587     p_dst0 = (int32_t*) (&Fout[0]);
   588     p_dst1 = (int32_t*) (&Fout[4]);
   589     p_dst2 = (int32_t*) (&Fout[8]);
   590     p_dst3 = (int32_t*) (&Fout[12]);
   591     p_tw1 = (int32_t*) tw1;
   592     p_tw2 = (int32_t*) tw2;
   593     p_tw3 = (int32_t*) tw3;
   594     q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
   595     q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
   596     q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
   597     q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
   598     q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
   599     q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
   600     q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
   601     q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
   602     q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
   603     q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
   604     q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
   605     q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
   606     q2_tw1 = vld2q_s32 (p_tw1);
   607     q2_tw2 = vld2q_s32 (p_tw2);
   608     q2_tw3 = vld2q_s32 (p_tw3);
   610     q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
   611     q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
   612     q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
   613     q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
   614     q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
   615     q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
   616     q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
   617     q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
   618     q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
   619     q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
   620     q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
   621     q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
   622     q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
   623     q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
   624     q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
   625     q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
   626     q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
   627     q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);
   629     q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
   630     q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
   631     q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
   632     q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);
   634     q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
   635     q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
   636     q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
   637     q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);
   639     q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
   640     q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
   641     q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
   642     q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);
   644     q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
   645     q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
   646     q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
   647     q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
   649     vst2q_s32 (p_dst0, q2_out_0123);
   650     vst2q_s32 (p_dst1, q2_out_4567);
   651     vst2q_s32 (p_dst2, q2_out_89ab);
   652     vst2q_s32 (p_dst3, q2_out_cdef);
   662     int32_t *p_src0, *p_src4, *p_src8, *p_src12;
   663     int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
   664     int32x4_t q_t0_r,  q_t0_i, q_t1_r,  q_t1_i, q_t2_r,  q_t2_i, q_t3_r, q_t3_i;
   665     int32x4_t q_out_r048c,  q_out_i048c, q_out_r159d,  q_out_i159d;
   666     int32x4_t q_out_r26ae,  q_out_i26ae, q_out_r37bf,  q_out_i37bf;
   667     p_src0 = (int32_t*) (& (Fin[0]));
   668     p_src4 = (int32_t*) (& (Fin[4]));
   669     p_src8 = (int32_t*) (& (Fin[8]));
   670     p_src12 = (int32_t*) (& (Fin[12]));
   671     q2_in_0123 = vld2q_s32 (p_src0);
   672     q2_in_4567 = vld2q_s32 (p_src4);
   673     q2_in_89ab = vld2q_s32 (p_src8);
   674     q2_in_cdef = vld2q_s32 (p_src12);
   676     q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   677     q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   678     q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   679     q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   681     q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   682     q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   683     q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   684     q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   686     q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
   687     q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
   688     q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
   689     q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
   690     q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
   691     q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
   692     q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
   693     q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);
   697     int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
   698     int32_t *p_tw1, *p_tw2, *p_tw3;
   699     int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
   700     int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
   701     int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
   702     int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
   703     int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
   704     int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
   705     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
   706     int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
   710     p_dst0 = (int32_t*) (&Fout[0]);
   711     p_dst1 = (int32_t*) (&Fout[4]);
   712     p_dst2 = (int32_t*) (&Fout[8]);
   713     p_dst3 = (int32_t*) (&Fout[12]);
   714     p_tw1 = (int32_t*) tw1;
   715     p_tw2 = (int32_t*) tw2;
   716     p_tw3 = (int32_t*) tw3;
   717     q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
   718     q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
   719     q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
   720     q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
   721     q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
   722     q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
   723     q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
   724     q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
   725     q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
   726     q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
   727     q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
   728     q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
   729     q2_tw1 = vld2q_s32 (p_tw1);
   730     q2_tw2 = vld2q_s32 (p_tw2);
   731     q2_tw3 = vld2q_s32 (p_tw3);
   733     q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
   734     q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
   735     q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
   736     q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
   737     q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
   738     q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
   739     q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
   740     q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
   741     q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
   742     q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
   743     q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
   744     q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
   746     q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
   747     q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
   748     q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
   749     q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
   750     q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
   751     q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);
   753     q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
   754     q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
   755     q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
   756     q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);
   758     q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
   759     q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
   760     q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
   761     q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);
   763     q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
   764     q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
   765     q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
   766     q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);
   768     q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
   769     q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
   770     q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
   771     q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
   773     vst2q_s32 (p_dst0, q2_out_0123);
   774     vst2q_s32 (p_dst1, q2_out_4567);
   775     vst2q_s32 (p_dst2, q2_out_89ab);
   776     vst2q_s32 (p_dst3, q2_out_cdef);
   786     int32_t *p_src0, *p_src4, *p_src8, *p_src12;
   787     int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
   788     int32x4_t q_t0_r,  q_t0_i, q_t1_r,  q_t1_i, q_t2_r,  q_t2_i, q_t3_r, q_t3_i;
   789     int32x4_t q_out_r048c,  q_out_i048c, q_out_r159d,  q_out_i159d;
   790     int32x4_t q_out_r26ae,  q_out_i26ae, q_out_r37bf,  q_out_i37bf;
   791     p_src0 = (int32_t*) (& (Fin[0]));
   792     p_src4 = (int32_t*) (& (Fin[4]));
   793     p_src8 = (int32_t*) (& (Fin[8]));
   794     p_src12 = (int32_t*) (& (Fin[12]));
   795     q2_in_0123 = vld2q_s32 (p_src0);
   796     q2_in_4567 = vld2q_s32 (p_src4);
   797     q2_in_89ab = vld2q_s32 (p_src8);
   798     q2_in_cdef = vld2q_s32 (p_src12);
   800     q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   801     q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   802     q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
   803     q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
   805     q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   806     q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   807     q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
   808     q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
   810     q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
   811     q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
   812     q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
   813     q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
   814     q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
   815     q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
   816     q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
   817     q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);
   820     int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
   821     int32_t *p_tw1, *p_tw2, *p_tw3;
   822     int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
   823     int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
   824     int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
   825     int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
   826     int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
   827     int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
   828     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
   829     int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
   833     p_dst0 = (int32_t*) (&Fout[0]);
   834     p_dst1 = (int32_t*) (&Fout[4]);
   835     p_dst2 = (int32_t*) (&Fout[8]);
   836     p_dst3 = (int32_t*) (&Fout[12]);
   837     p_tw1 = (int32_t*) tw1;
   838     p_tw2 = (int32_t*) tw2;
   839     p_tw3 = (int32_t*) tw3;
   840     q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
   841     q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
   842     q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
   843     q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
   844     q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
   845     q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
   846     q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
   847     q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
   848     q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
   849     q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
   850     q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
   851     q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
   852     q2_tw1 = vld2q_s32 (p_tw1);
   853     q2_tw2 = vld2q_s32 (p_tw2);
   854     q2_tw3 = vld2q_s32 (p_tw3);
   856     q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
   857     q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
   858     q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
   859     q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
   860     q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
   861     q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
   862     q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
   863     q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
   864     q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
   865     q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
   866     q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
   867     q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
   868     q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
   869     q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
   870     q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
   871     q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
   872     q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
   873     q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);
   875     q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
   876     q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
   877     q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
   878     q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);
   880     q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
   881     q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
   882     q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
   883     q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);
   885     q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
   886     q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
   887     q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
   888     q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);
   890     q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
   891     q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
   892     q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
   893     q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
   895     vst2q_s32 (p_dst0, q2_out_0123);
   896     vst2q_s32 (p_dst1, q2_out_4567);
   897     vst2q_s32 (p_dst2, q2_out_89ab);
   898     vst2q_s32 (p_dst3, q2_out_cdef);
   910     int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
   911     int32x4_t q_fpnk_r, q_fpnk_i;
   912     int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
   913     int32x4_t q_tw_r, q_tw_i;
   914     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
   915     int32x4_t q_dst2_r, q_dst2_i;
   916     int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
   924     dst[0].r = tdc.r + tdc.i;
   925     dst[ncfft].r = tdc.r - tdc.i;
   926     dst[ncfft].i = dst[0].i = 0;
   932             for (k = 1; k <= count ; k += 4)
   934                 p_src  = (int32_t*) (& (src[k]));
   935                 p_src2  = (int32_t*) (& (src[ncfft - k - 3]));
   936                 p_twiddles  = (int32_t*) (& (twiddles[k - 1]));
   937                 p_dst  = (int32_t*) (& (dst[k]));
   938                 p_dst2  = (int32_t*) (& (dst[ncfft - k - 3]));
   940                 q2_fpk  = vld2q_s32 (p_src);
   941                 q2_fpnk = vld2q_s32 (p_src2);
   943                 q2_tw = vld2q_s32 (p_twiddles);
   944                 q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
   945                 q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
   946                 q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
   947                 q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
   948                 q_fpnk_i = vnegq_s32 (q_fpnk_i);
   950                 q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
   951                 q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);
   953                 q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
   954                 q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);
   956                 q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
   957                 q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
   958                 q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
   959                 q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
   960                 q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
   961                 q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
   963                 q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
   964                 q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
   965                 q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
   966                 q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
   967                 q_dst2_r = vrev64q_s32 (q_dst2_r);
   968                 q_dst2_i = vrev64q_s32 (q_dst2_i);
   969                 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
   970                 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
   971                 vst2q_s32 (p_dst, q2_dst);
   972                 vst2q_s32 (p_dst2, q2_dst2);
   978             for (k = 1; k <= count ; k += 4)
   980                 p_src  = (int32_t*) (& (src[k]));
   981                 p_src2  = (int32_t*) (& (src[ncfft - k - 3]));
   982                 p_twiddles  = (int32_t*) (& (twiddles[k - 1]));
   983                 p_dst  = (int32_t*) (& (dst[k]));
   984                 p_dst2  = (int32_t*) (& (dst[ncfft - k - 3]));
   986                 q2_fpk  = vld2q_s32 (p_src);
   987                 q2_fpnk = vld2q_s32 (p_src2);
   989                 q2_tw = vld2q_s32 (p_twiddles);
   990                 q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
   991                 q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
   992                 q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
   993                 q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
   994                 q_fpnk_i = vnegq_s32 (q_fpnk_i);
   996                 q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
   997                 q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);
   999                 q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
  1000                 q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);
  1002                 q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
  1003                 q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
  1004                 q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
  1005                 q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
  1006                 q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
  1007                 q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
  1009                 q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
  1010                 q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
  1011                 q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
  1012                 q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
  1013                 q_dst2_r = vrev64q_s32 (q_dst2_r);
  1014                 q_dst2_i = vrev64q_s32 (q_dst2_i);
  1015                 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
  1016                 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
  1017                 vst2q_s32 (p_dst, q2_dst);
  1018                 vst2q_s32 (p_dst2, q2_dst2);
  1026         for (k = 1; k <= ncfft / 2 ; ++k)
  1029             fpnk.r =   src[ncfft - k].r;
  1030             fpnk.i = - src[ncfft - k].i;
  1037             f1k.r = fpk.r + fpnk.r;
  1038             f1k.i = fpk.i + fpnk.i;
  1040             f2k.r = fpk.r - fpnk.r;
  1041             f2k.i = fpk.i - fpnk.i;
  1046             dst[k].r = (f1k.r + tw.r) >> 1;
  1047             dst[k].i = (f1k.i + tw.i) >> 1;
  1048             dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
  1049             dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
  1064     int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
  1065     int32x4_t q_fnkc_r, q_fnkc_i;
  1066     int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
  1067     int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
  1068     int32x4_t q_dst2_r, q_dst2_i;
  1069     int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
  1072     dst[0].r = src[0].r + src[ncfft].r;
  1073     dst[0].i = src[0].r - src[ncfft].r;
  1080             for (k = 1; k <= count ; k += 4)
  1082                 p_src  = (int32_t*) (& (src[k]));
  1083                 p_src2  = (int32_t*) (& (src[ncfft - k - 3]));
  1084                 p_twiddles  = (int32_t*) (& (twiddles[k - 1]));
  1085                 p_dst  = (int32_t*) (& (dst[k]));
  1086                 p_dst2  = (int32_t*) (& (dst[ncfft - k - 3]));
  1088                 q2_fk  = vld2q_s32 (p_src);
  1089                 q2_fnkc = vld2q_s32 (p_src2);
  1090                 q2_tw = vld2q_s32 (p_twiddles);
  1091                 q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
  1092                 q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
  1093                 q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
  1094                 q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
  1095                 q_fnkc_i = vnegq_s32 (q_fnkc_i);
  1097                 q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
  1098                 q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
  1099                 q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
  1100                 q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);
  1102                 q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
  1103                 q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
  1104                 q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
  1105                 q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
  1106                 q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
  1107                 q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
  1109                 q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
  1110                 q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
  1111                 q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
  1112                 q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
  1113                 q_dst2_r = vrev64q_s32 (q_dst2_r);
  1114                 q_dst2_i = vrev64q_s32 (q_dst2_i);
  1115                 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
  1116                 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
  1117                 vst2q_s32 (p_dst, q2_dst);
  1118                 vst2q_s32 (p_dst2, q2_dst2);
  1125             for (k = 1; k <= count ; k += 4)
  1127                 p_src  = (int32_t*) (& (src[k]));
  1128                 p_src2  = (int32_t*) (& (src[ncfft - k - 3]));
  1129                 p_twiddles  = (int32_t*) (& (twiddles[k - 1]));
  1130                 p_dst  = (int32_t*) (& (dst[k]));
  1131                 p_dst2  = (int32_t*) (& (dst[ncfft - k - 3]));
  1133                 q2_fk  = vld2q_s32 (p_src);
  1134                 q2_fnkc = vld2q_s32 (p_src2);
  1135                 q2_tw = vld2q_s32 (p_twiddles);
  1136                 q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
  1137                 q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
  1138                 q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
  1139                 q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
  1140                 q_fnkc_i = vnegq_s32 (q_fnkc_i);
  1142                 q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
  1143                 q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
  1144                 q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
  1145                 q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);
  1147                 q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
  1148                 q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
  1149                 q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
  1150                 q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
  1151                 q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
  1152                 q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
  1154                 q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
  1155                 q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
  1156                 q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
  1157                 q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
  1158                 q_dst2_r = vrev64q_s32 (q_dst2_r);
  1159                 q_dst2_i = vrev64q_s32 (q_dst2_i);
  1160                 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
  1161                 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
  1162                 vst2q_s32 (p_dst, q2_dst);
  1163                 vst2q_s32 (p_dst2, q2_dst2);
  1171         for (k = 1; k <= ncfft / 2; k++)
  1174             fnkc.r = src[ncfft - k].r;
  1175             fnkc.i = -src[ncfft - k].i;
  1182             fek.r = fk.r + fnkc.r;
  1183             fek.i = fk.i + fnkc.i;
  1185             tmp.r = fk.r - fnkc.r;
  1186             tmp.i = fk.i - fnkc.i;
  1191             dst[k].r = fek.r + fok.r;
  1192             dst[k].i = fek.i + fok.i;
  1194             dst[ncfft - k].r = fek.r - fok.r;
  1195             dst[ncfft - k].i = fok.i - fek.i;
  1248                 ne10_fft4_backward_int32_scaled (fout, fin);
  1251                 ne10_fft8_backward_int32_scaled (fout, fin);
  1254                 ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
  1266                 ne10_fft4_forward_int32_scaled (fout, fin);
  1269                 ne10_fft8_forward_int32_scaled (fout, fin);
  1272                 ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
  1287                 ne10_fft4_backward_int32_unscaled (fout, fin);
  1290                 ne10_fft8_backward_int32_unscaled (fout, fin);
  1293                 ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
  1305                 ne10_fft4_forward_int32_unscaled (fout, fin);
  1308                 ne10_fft8_forward_int32_unscaled (fout, fin);
  1311                 ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
  1337     c2c_state.buffer = tmpbuf2;
  1340     ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1,  cfg->super_twiddles, cfg->ncfft, scaled_flag);
  1360     c2c_state.buffer = tmpbuf2;
  1362     ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
 #define NE10_FFT_ALG_DEFAULT
void ne10_fft_c2c_1d_int32_neon(ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_c2c_1d_int32 using NEON SIMD capabilities. 
ne10_fft_cpx_int32_t * twiddles
void ne10_mixed_radix_fft_backward_int32_scaled_neon(ne10_fft_cpx_int32_t *Fout, ne10_fft_cpx_int32_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer) asm("ne10_mixed_radix_fft_backward_int32_scaled_neon")
void ne10_fft_c2r_1d_int32_neon(ne10_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_c2r_1d_int32 using NEON SIMD capabilities. 
void ne10_mixed_radix_fft_forward_int32_unscaled_neon(ne10_fft_cpx_int32_t *Fout, ne10_fft_cpx_int32_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer) asm("ne10_mixed_radix_fft_forward_int32_unscaled_neon")
#define NE10_F2I32_SAMPPROD
#define NE10_F2I32_FIXDIV(c, div)
void ne10_fft_r2c_1d_int32_neon(ne10_fft_cpx_int32_t *fout, ne10_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_r2c_1d_int32 using NEON SIMD capabilities. 
void ne10_fft_c2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Specific implementation of ne10_fft_c2c_1d_int32 using plain C. 
void ne10_mixed_radix_fft_backward_int32_unscaled_neon(ne10_fft_cpx_int32_t *Fout, ne10_fft_cpx_int32_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer) asm("ne10_mixed_radix_fft_backward_int32_unscaled_neon")
Structure for the 32-bit fixed point FFT function. 
void ne10_mixed_radix_generic_butterfly_inverse_int32_neon(ne10_fft_cpx_int32_t *Fout, const ne10_fft_cpx_int32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer, const ne10_int32_t scaled_flag)
ne10_fft_cpx_int32_t * twiddles
void ne10_mixed_radix_generic_butterfly_int32_neon(ne10_fft_cpx_int32_t *Fout, const ne10_fft_cpx_int32_t *Fin, const ne10_int32_t *factors, const ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer, const ne10_int32_t scaled_flag)
ne10_fft_cpx_int32_t * buffer
ne10_fft_cpx_int32_t * buffer
ne10_fft_cpx_int32_t * super_twiddles
void ne10_mixed_radix_fft_forward_int32_scaled_neon(ne10_fft_cpx_int32_t *Fout, ne10_fft_cpx_int32_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int32_t *twiddles, ne10_fft_cpx_int32_t *buffer) asm("ne10_mixed_radix_fft_forward_int32_scaled_neon")