#define FFT4_FS_START \
        ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
        ne10_int32_t tmp_r, tmp_i;

/* Unscaled first stage; the macro name was lost in extraction and is
 * restored from its FFT4_FS_SCALED counterpart below. */
#define FFT4_FS \
        s2_r = Fin[0].r - Fin[2].r; \
        s2_i = Fin[0].i - Fin[2].i; \
        tmp_r = Fin[0].r + Fin[2].r; \
        tmp_i = Fin[0].i + Fin[2].i; \
        s0_r = Fin[1].r + Fin[3].r; \
        s0_i = Fin[1].i + Fin[3].i; \
        s1_r = Fin[1].r - Fin[3].r; \
        s1_i = Fin[1].i - Fin[3].i;

#define FFT4_FS_SCALED \
        s2_r = (Fin[0].r - Fin[2].r) >> 2; \
        s2_i = (Fin[0].i - Fin[2].i) >> 2; \
        tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
        tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
        s0_r = (Fin[1].r + Fin[3].r) >> 2; \
        s0_i = (Fin[1].i + Fin[3].i) >> 2; \
        s1_r = (Fin[1].r - Fin[3].r) >> 2; \
        s1_i = (Fin[1].i - Fin[3].i) >> 2;

/* Forward last stage (name restored; cf. FFT8_FWD_LS below). */
#define FFT4_FWD_LS \
        Fout[2].r = tmp_r - s0_r; \
        Fout[2].i = tmp_i - s0_i; \
        Fout[0].r = tmp_r + s0_r; \
        Fout[0].i = tmp_i + s0_i; \
        Fout[1].r = s2_r + s1_i; \
        Fout[1].i = s2_i - s1_r; \
        Fout[3].r = s2_r - s1_i; \
        Fout[3].i = s2_i + s1_r;

/* Inverse last stage: the j rotations swap sign. */
#define FFT4_INV_LS \
        Fout[2].r = tmp_r - s0_r; \
        Fout[2].i = tmp_i - s0_i; \
        Fout[0].r = tmp_r + s0_r; \
        Fout[0].i = tmp_i + s0_i; \
        Fout[1].r = s2_r - s1_i; \
        Fout[1].i = s2_i + s1_r; \
        Fout[3].r = s2_r + s1_i; \
        Fout[3].i = s2_i - s1_r;

#define FFT8_FS_START \
        ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
        ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
        const ne10_int32_t TW_81 = 1518500249; /* cos(pi/4) in Q31 */

/* Unscaled first stage (name restored from FFT8_FS_SCALED). */
#define FFT8_FS \
        s0_r = Fin[0].r + Fin[4].r; \
        s0_i = Fin[0].i + Fin[4].i; \
        s1_r = Fin[0].r - Fin[4].r; \
        s1_i = Fin[0].i - Fin[4].i; \
        s2_r = Fin[1].r + Fin[5].r; \
        s2_i = Fin[1].i + Fin[5].i; \
        s3_r = Fin[1].r - Fin[5].r; \
        s3_i = Fin[1].i - Fin[5].i; \
        s4_r = Fin[2].r + Fin[6].r; \
        s4_i = Fin[2].i + Fin[6].i; \
        s5_r = Fin[2].r - Fin[6].r; \
        s5_i = Fin[2].i - Fin[6].i; \
        s6_r = Fin[3].r + Fin[7].r; \
        s6_i = Fin[3].i + Fin[7].i; \
        s7_r = Fin[3].r - Fin[7].r; \
        s7_i = Fin[3].i - Fin[7].i;

#define FFT8_FS_SCALED \
        s0_r = (Fin[0].r + Fin[4].r) >> 3; \
        s0_i = (Fin[0].i + Fin[4].i) >> 3; \
        s1_r = (Fin[0].r - Fin[4].r) >> 3; \
        s1_i = (Fin[0].i - Fin[4].i) >> 3; \
        s2_r = (Fin[1].r + Fin[5].r) >> 3; \
        s2_i = (Fin[1].i + Fin[5].i) >> 3; \
        s3_r = (Fin[1].r - Fin[5].r) >> 3; \
        s3_i = (Fin[1].i - Fin[5].i) >> 3; \
        s4_r = (Fin[2].r + Fin[6].r) >> 3; \
        s4_i = (Fin[2].i + Fin[6].i) >> 3; \
        s5_r = (Fin[2].r - Fin[6].r) >> 3; \
        s5_i = (Fin[2].i - Fin[6].i) >> 3; \
        s6_r = (Fin[3].r + Fin[7].r) >> 3; \
        s6_i = (Fin[3].i + Fin[7].i) >> 3; \
        s7_r = (Fin[3].r - Fin[7].r) >> 3; \
        s7_i = (Fin[3].i - Fin[7].i) >> 3;

#define FFT8_FWD_LS \
        t0_r = s0_r - s4_r; \
        t0_i = s0_i - s4_i; \
        t1_r = s0_r + s4_r; \
        t1_i = s0_i + s4_i; \
        t2_r = s2_r + s6_r; \
        t2_i = s2_i + s6_i; \
        t3_r = s2_r - s6_r; \
        t3_i = s2_i - s6_i; \
        Fout[0].r = t1_r + t2_r; \
        Fout[0].i = t1_i + t2_i; \
        Fout[4].r = t1_r - t2_r; \
        Fout[4].i = t1_i - t2_i; \
        Fout[2].r = t0_r + t3_i; \
        Fout[2].i = t0_i - t3_r; \
        Fout[6].r = t0_r - t3_i; \
        Fout[6].i = t0_i + t3_r; \
        t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
        t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
        t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
        t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
        t0_r = s1_r - s5_i; \
        t0_i = s1_i + s5_r; \
        t1_r = s1_r + s5_i; \
        t1_i = s1_i - s5_r; \
        t2_r = t4_r - t5_r; \
        t2_i = t4_i - t5_i; \
        t3_r = t4_r + t5_r; \
        t3_i = t4_i + t5_i; \
        Fout[1].r = t1_r + t2_r; \
        Fout[1].i = t1_i + t2_i; \
        Fout[5].r = t1_r - t2_r; \
        Fout[5].i = t1_i - t2_i; \
        Fout[3].r = t0_r + t3_i; \
        Fout[3].i = t0_i - t3_r; \
        Fout[7].r = t0_r - t3_i; \
        Fout[7].i = t0_i + t3_r;

#define FFT8_INV_LS \
        t0_r = s0_r - s4_r; \
        t0_i = s0_i - s4_i; \
        t1_r = s0_r + s4_r; \
        t1_i = s0_i + s4_i; \
        t2_r = s2_r + s6_r; \
        t2_i = s2_i + s6_i; \
        t3_r = s2_r - s6_r; \
        t3_i = s2_i - s6_i; \
        Fout[0].r = t1_r + t2_r; \
        Fout[0].i = t1_i + t2_i; \
        Fout[4].r = t1_r - t2_r; \
        Fout[4].i = t1_i - t2_i; \
        Fout[2].r = t0_r - t3_i; \
        Fout[2].i = t0_i + t3_r; \
        Fout[6].r = t0_r + t3_i; \
        Fout[6].i = t0_i - t3_r; \
        t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
        t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
        t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
        t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
        t0_r = s1_r + s5_i; \
        t0_i = s1_i - s5_r; \
        t1_r = s1_r - s5_i; \
        t1_i = s1_i + s5_r; \
        t2_r = t4_r - t5_r; \
        t2_i = t4_i - t5_i; \
        t3_r = t4_r + t5_r; \
        t3_i = t4_i + t5_i; \
        Fout[1].r = t1_r + t2_r; \
        Fout[1].i = t1_i + t2_i; \
        Fout[5].r = t1_r - t2_r; \
        Fout[5].i = t1_i - t2_i; \
        Fout[3].r = t0_r - t3_i; \
        Fout[3].i = t0_i + t3_r; \
        Fout[7].r = t0_r + t3_i; \
        Fout[7].i = t0_i - t3_r;

#define FFT16_FS_START \
        ne10_fft_cpx_int32_t *tw1, *tw2, *tw3; \
        int32_t *p_src0, *p_src4, *p_src8, *p_src12; \
        int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; \
        int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; \
        int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; \
        int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;

#define FFT16_LS_START \
        int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; \
        int32_t *p_tw1, *p_tw2, *p_tw3; \
        int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; \
        int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; \
        int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; \
        int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; \
        int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; \
        int32x4x2_t q2_tw1, q2_tw2, q2_tw3; \
        int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
        int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;

/* Unscaled first stage (name restored from FFT16_FS_SCALED). */
#define FFT16_FS \
        p_src0 = (int32_t*) (& (Fin[0])); \
        p_src4 = (int32_t*) (& (Fin[4])); \
        p_src8 = (int32_t*) (& (Fin[8])); \
        p_src12 = (int32_t*) (& (Fin[12])); \
        q2_in_0123 = vld2q_s32 (p_src0); \
        q2_in_4567 = vld2q_s32 (p_src4); \
        q2_in_89ab = vld2q_s32 (p_src8); \
        q2_in_cdef = vld2q_s32 (p_src12); \
        q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
        q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
        q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
        q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
        q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
        q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
        q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
        q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
        q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r); \
        q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i); \
        q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r); \
        q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);

#define FFT16_FS_SCALED \
        p_src0 = (int32_t*) (& (Fin[0])); \
        p_src4 = (int32_t*) (& (Fin[4])); \
        p_src8 = (int32_t*) (& (Fin[8])); \
        p_src12 = (int32_t*) (& (Fin[12])); \
        q2_in_0123 = vld2q_s32 (p_src0); \
        q2_in_4567 = vld2q_s32 (p_src4); \
        q2_in_89ab = vld2q_s32 (p_src8); \
        q2_in_cdef = vld2q_s32 (p_src12); \
        q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
        q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
        q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
        q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
        q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
        q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
        q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
        q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
        q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r); \
        q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i); \
        q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r); \
        q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);

/* The tw1 assignment was lost in extraction and is restored to match the
 * stride-4 twiddle layout used by tw2/tw3. */
#define FFT16_LS_LOAD \
        tw1 = twiddles; \
        tw2 = twiddles + 4; \
        tw3 = twiddles + 8; \
        p_dst0 = (int32_t*) (&Fout[0]); \
        p_dst1 = (int32_t*) (&Fout[4]); \
        p_dst2 = (int32_t*) (&Fout[8]); \
        p_dst3 = (int32_t*) (&Fout[12]); \
        p_tw1 = (int32_t*) tw1; \
        p_tw2 = (int32_t*) tw2; \
        p_tw3 = (int32_t*) tw3; \
        q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d); \
        q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d); \
        q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf); \
        q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf); \
        q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0])); \
        q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0])); \
        q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0])); \
        q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0])); \
        q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1])); \
        q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1])); \
        q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1])); \
        q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1])); \
        q2_tw1 = vld2q_s32 (p_tw1); \
        q2_tw2 = vld2q_s32 (p_tw2); \
        q2_tw3 = vld2q_s32 (p_tw3);

#define FFT16_FWD_LS \
        q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
        q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
        q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
        q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
        q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
        q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]); \
        q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
        q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
        q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
        q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
        q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
        q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);

#define FFT16_INV_LS \
        q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
        q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
        q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
        q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
        q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
        q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]); \
        q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
        q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
        q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
        q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
        q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
        q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);

#define FFT16_FWD_LS_S0 \
        q_s0_r = vsubq_s32 (q_s0_r, q_tmp0); \
        q_s0_i = vaddq_s32 (q_s0_i, q_tmp1); \
        q_s1_r = vsubq_s32 (q_s1_r, q_tmp2); \
        q_s1_i = vaddq_s32 (q_s1_i, q_tmp3); \
        q_s2_r = vsubq_s32 (q_s2_r, q_tmp4); \
        q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

#define FFT16_INV_LS_S0 \
        q_s0_r = vaddq_s32 (q_s0_r, q_tmp0); \
        q_s0_i = vsubq_s32 (q_s0_i, q_tmp1); \
        q_s1_r = vaddq_s32 (q_s1_r, q_tmp2); \
        q_s1_i = vsubq_s32 (q_s1_i, q_tmp3); \
        q_s2_r = vaddq_s32 (q_s2_r, q_tmp4); \
        q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

#define FFT16_LS_02 \
        q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r); \
        q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i); \
        q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r); \
        q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i); \
        q_s3_r = vaddq_s32 (q_s0_r, q_s2_r); \
        q_s3_i = vaddq_s32 (q_s0_i, q_s2_i); \
        q_s4_r = vsubq_s32 (q_s0_r, q_s2_r); \
        q_s4_i = vsubq_s32 (q_s0_i, q_s2_i); \
        q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r); \
        q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i); \
        q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r); \
        q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

#define FFT16_LS_02_SCALED \
        q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r); \
        q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i); \
        q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r); \
        q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i); \
        q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r); \
        q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i); \
        q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r); \
        q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i); \
        q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r); \
        q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i); \
        q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r); \
        q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

/* Final interleaved store (the macro name was lost in extraction;
 * FFT16_LS_STORE is an inferred name paired with FFT16_LS_LOAD). */
#define FFT16_LS_STORE \
        vst2q_s32 (p_dst0, q2_out_0123); \
        vst2q_s32 (p_dst1, q2_out_4567); \
        vst2q_s32 (p_dst2, q2_out_89ab); \
        vst2q_s32 (p_dst3, q2_out_cdef);

/* The fixed-size 16-point kernels below were partially elided in extraction;
 * the macro-invocation scaffolding is reconstructed, the arithmetic lines
 * are original. */
static inline void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS
    q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);
    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_FWD_LS
    FFT16_FWD_LS_S0
    FFT16_LS_02
    q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    FFT16_LS_STORE
}
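/* A minimal sketch (added; the original bodies were lost in extraction) of
 * how the FFT4 macros above compose into the fixed-size kernels that the C2C
 * dispatcher at the end of this file calls, e.g.
 * ne10_fft4_forward_int32_unscaled. */
static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin)
{
    FFT4_FS_START
    FFT4_FS
    FFT4_FWD_LS
}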
static inline void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS
    q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);
    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_INV_LS
    FFT16_INV_LS_S0
    FFT16_LS_02
    q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    FFT16_LS_STORE
}
static inline void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS_SCALED
    q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);
    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_FWD_LS
    FFT16_FWD_LS_S0
    FFT16_LS_02_SCALED
    q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    FFT16_LS_STORE
}
static inline void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * twiddles)
{
    /* first stage */
    FFT16_FS_START
    FFT16_FS_SCALED
    q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);
    /* second stage */
    FFT16_LS_START
    FFT16_LS_LOAD
    FFT16_INV_LS
    FFT16_INV_LS_S0
    FFT16_LS_02_SCALED
    q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    FFT16_LS_STORE
}
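/* Background sketch (added, not from the original file): the Q31 arithmetic
 * used throughout. vqdmulhq_s32 is a saturating doubling multiply returning
 * the high halves, i.e. roughly (a * b) >> 31 per lane, the same pattern the
 * FFT8 macros spell out with 64-bit scalars; vqrdmulhq_s32 additionally
 * rounds. TW_81 is cos(pi/4) in Q31:
 * (ne10_int32_t) (0.70710678118654752 * 2147483648.0) == 1518500249. */
static inline ne10_int32_t q31_mul (ne10_int32_t a, ne10_int32_t b)
{
    /* scalar equivalent of one vqdmulhq_s32 lane (ignoring saturation) */
    return (ne10_int32_t) (((ne10_int64_t) a * b) >> 31);
}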
#define RADIX8x4_START \
        ne10_int32_t f_count; \
        ne10_int32_t src_step = stride << 1; \
        const ne10_int32_t TW_81 = 1518500249; \
        const ne10_int32_t TW_81N = -1518500249; \
        int32_t *p_src, *p_dst; \
        int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7; \
        int32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i; \
        int32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i; \
        int32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i; \
        int32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i; \
        int32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i; \
        int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
        int32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i; \
        int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
        int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7; \
        int32x4_t q_tw_81, q_tw_81n; \
        p_src = (int32_t *) Fin; \
        p_dst = (int32_t *) Fout;

/* The p_src advances between the strided loads were lost in extraction and
 * are restored. */
#define RADIX8x4_LOAD \
        q2_in0 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in2 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in4 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in6 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in1 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in3 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in5 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in7 = vld2q_s32 (p_src); \
        p_src += src_step;

/* Transpose back to interleaved order and store contiguously; the p_dst
 * advances between stores are restored. */
#define RADIX8x4_STORE \
        q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
        q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
        q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
        q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
        q2_tmp4 = vtrnq_s32 (q_out4_r, q_out5_r); \
        q2_tmp5 = vtrnq_s32 (q_out4_i, q_out5_i); \
        q2_tmp6 = vtrnq_s32 (q_out6_r, q_out7_r); \
        q2_tmp7 = vtrnq_s32 (q_out6_i, q_out7_i); \
        q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
        q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
        q2_out2.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
        q2_out2.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
        q2_out4.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
        q2_out4.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
        q2_out6.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
        q2_out6.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
        q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[0]), vget_low_s32 (q2_tmp6.val[0])); \
        q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[0]), vget_low_s32 (q2_tmp7.val[0])); \
        q2_out3.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[1]), vget_low_s32 (q2_tmp6.val[1])); \
        q2_out3.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[1]), vget_low_s32 (q2_tmp7.val[1])); \
        q2_out5.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[0]), vget_high_s32 (q2_tmp6.val[0])); \
        q2_out5.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[0]), vget_high_s32 (q2_tmp7.val[0])); \
        q2_out7.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[1]), vget_high_s32 (q2_tmp6.val[1])); \
        q2_out7.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[1]), vget_high_s32 (q2_tmp7.val[1])); \
        vst2q_s32 (p_dst, q2_out0); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out1); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out2); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out3); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out4); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out5); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out6); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out7); \
        p_dst += 8; \
        p_src = p_src - src_step * 8 + 8;

#define RADIX8x4_FS_S0 \
        q_sin0_r = vaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
        q_sin0_i = vaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
        q_sin1_r = vsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
        q_sin1_i = vsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
        q_sin2_r = vaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
        q_sin2_i = vaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
        q_sin3_r = vsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
        q_sin3_i = vsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
        q_sin4_r = vaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
        q_sin4_i = vaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
        q_sin5_r = vsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
        q_sin5_i = vsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
        q_sin6_r = vaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
        q_sin6_i = vaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
        q_sin7_r = vsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
        q_sin7_i = vsubq_s32 (q2_in6.val[1], q2_in7.val[1]);

/* Forward twiddles for s3/s5/s7; the restored q_s5_r line completes the
 * multiply of sin5 by -j. */
#define RADIX8x4_FWD_S357 \
        q_tw_81 = vdupq_n_s32 (TW_81); \
        q_tw_81n = vdupq_n_s32 (TW_81N); \
        q_s5_r = q_sin5_i; \
        q_s5_i = vnegq_s32 (q_sin5_r); \
        q_s3_r = vaddq_s32 (q_sin3_r, q_sin3_i); \
        q_s3_i = vsubq_s32 (q_sin3_i, q_sin3_r); \
        q_s7_r = vsubq_s32 (q_sin7_r, q_sin7_i); \
        q_s7_i = vaddq_s32 (q_sin7_i, q_sin7_r); \
        q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
        q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
        q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
        q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);

/* Inverse twiddles; the restored q_s5_i line completes the multiply of sin5
 * by +j. */
#define RADIX8x4_INV_S357 \
        q_tw_81 = vdupq_n_s32 (TW_81); \
        q_tw_81n = vdupq_n_s32 (TW_81N); \
        q_s5_r = vnegq_s32 (q_sin5_i); \
        q_s5_i = q_sin5_r; \
        q_s3_r = vsubq_s32 (q_sin3_r, q_sin3_i); \
        q_s3_i = vaddq_s32 (q_sin3_i, q_sin3_r); \
        q_s7_r = vaddq_s32 (q_sin7_r, q_sin7_i); \
        q_s7_i = vsubq_s32 (q_sin7_i, q_sin7_r); \
        q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
        q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
        q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
        q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);

#define RADIX8x4_LS_02 \
        q_s8_r = vaddq_s32 (q_sin0_r, q_sin4_r); \
        q_s8_i = vaddq_s32 (q_sin0_i, q_sin4_i); \
        q_s9_r = vaddq_s32 (q_sin1_r, q_s5_r); \
        q_s9_i = vaddq_s32 (q_sin1_i, q_s5_i); \
        q_s10_r = vsubq_s32 (q_sin0_r, q_sin4_r); \
        q_s10_i = vsubq_s32 (q_sin0_i, q_sin4_i); \
        q_s11_r = vsubq_s32 (q_sin1_r, q_s5_r); \
        q_s11_i = vsubq_s32 (q_sin1_i, q_s5_i); \
        q_s12_r = vaddq_s32 (q_sin2_r, q_sin6_r); \
        q_s12_i = vaddq_s32 (q_sin2_i, q_sin6_i); \
        q_s13_r = vaddq_s32 (q_s3_r, q_s7_r); \
        q_s13_i = vaddq_s32 (q_s3_i, q_s7_i); \
        q_s14_r = vsubq_s32 (q_sin2_r, q_sin6_r); \
        q_s14_i = vsubq_s32 (q_sin2_i, q_sin6_i); \
        q_s15_r = vsubq_s32 (q_s3_r, q_s7_r); \
        q_s15_i = vsubq_s32 (q_s3_i, q_s7_i); \
        q_out4_r = vsubq_s32 (q_s8_r, q_s12_r); \
        q_out4_i = vsubq_s32 (q_s8_i, q_s12_i); \
        q_out5_r = vsubq_s32 (q_s9_r, q_s13_r); \
        q_out5_i = vsubq_s32 (q_s9_i, q_s13_i); \
        q_out0_r = vaddq_s32 (q_s8_r, q_s12_r); \
        q_out0_i = vaddq_s32 (q_s8_i, q_s12_i); \
        q_out1_r = vaddq_s32 (q_s9_r, q_s13_r); \
        q_out1_i = vaddq_s32 (q_s9_i, q_s13_i);

#define RADIX8x4_FS_S0_SCALED \
        q_sin0_r = vhaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
        q_sin0_i = vhaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
        q_sin1_r = vhsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
        q_sin1_i = vhsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
        q_sin2_r = vhaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
        q_sin2_i = vhaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
        q_sin3_r = vhsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
        q_sin3_i = vhsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
        q_sin4_r = vhaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
        q_sin4_i = vhaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
        q_sin5_r = vhsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
        q_sin5_i = vhsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
        q_sin6_r = vhaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
        q_sin6_i = vhaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
        q_sin7_r = vhsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
        q_sin7_i = vhsubq_s32 (q2_in6.val[1], q2_in7.val[1]);

#define RADIX8x4_LS_02_SCALED \
        q_s8_r = vhaddq_s32 (q_sin0_r, q_sin4_r); \
        q_s8_i = vhaddq_s32 (q_sin0_i, q_sin4_i); \
        q_s9_r = vhaddq_s32 (q_sin1_r, q_s5_r); \
        q_s9_i = vhaddq_s32 (q_sin1_i, q_s5_i); \
        q_s10_r = vhsubq_s32 (q_sin0_r, q_sin4_r); \
        q_s10_i = vhsubq_s32 (q_sin0_i, q_sin4_i); \
        q_s11_r = vhsubq_s32 (q_sin1_r, q_s5_r); \
        q_s11_i = vhsubq_s32 (q_sin1_i, q_s5_i); \
        q_s12_r = vhaddq_s32 (q_sin2_r, q_sin6_r); \
        q_s12_i = vhaddq_s32 (q_sin2_i, q_sin6_i); \
        q_s13_r = vhaddq_s32 (q_s3_r, q_s7_r); \
        q_s13_i = vhaddq_s32 (q_s3_i, q_s7_i); \
        q_s14_r = vhsubq_s32 (q_sin2_r, q_sin6_r); \
        q_s14_i = vhsubq_s32 (q_sin2_i, q_sin6_i); \
        q_s15_r = vhsubq_s32 (q_s3_r, q_s7_r); \
        q_s15_i = vhsubq_s32 (q_s3_i, q_s7_i); \
        q_out4_r = vhsubq_s32 (q_s8_r, q_s12_r); \
        q_out4_i = vhsubq_s32 (q_s8_i, q_s12_i); \
        q_out5_r = vhsubq_s32 (q_s9_r, q_s13_r); \
        q_out5_i = vhsubq_s32 (q_s9_i, q_s13_i); \
        q_out0_r = vhaddq_s32 (q_s8_r, q_s12_r); \
        q_out0_i = vhaddq_s32 (q_s8_i, q_s12_i); \
        q_out1_r = vhaddq_s32 (q_s9_r, q_s13_r); \
        q_out1_i = vhaddq_s32 (q_s9_i, q_s13_i);

/* Function frames below are reconstructed from the macro set; the q_out
 * arithmetic lines are original. */
static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0
        RADIX8x4_FWD_S357
        RADIX8x4_LS_02
        q_out2_r = vaddq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vsubq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vaddq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vsubq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vsubq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vaddq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vsubq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vaddq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
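/* Note (added): each backward kernel is its forward twin with the sense of
 * the "multiply by j" rotation flipped. For x = (r, i), x * (-j) = (i, -r)
 * and x * (+j) = (-i, r), which is why the q_out2/q_out3/q_out6/q_out7 lines
 * simply swap their vaddq/vsubq pairs between the two directions. */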
static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0
        RADIX8x4_INV_S357
        RADIX8x4_LS_02
        q_out2_r = vsubq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vaddq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vsubq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vaddq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vaddq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vsubq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vaddq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vsubq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_FWD_S357
        RADIX8x4_LS_02_SCALED
        q_out2_r = vhaddq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vhsubq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vhaddq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vhsubq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vhsubq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vhaddq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vhsubq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vhaddq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD
        RADIX8x4_FS_S0_SCALED
        RADIX8x4_INV_S357
        RADIX8x4_LS_02_SCALED
        q_out2_r = vhsubq_s32 (q_s10_r, q_s14_i);
        q_out2_i = vhaddq_s32 (q_s10_i, q_s14_r);
        q_out3_r = vhsubq_s32 (q_s11_r, q_s15_i);
        q_out3_i = vhaddq_s32 (q_s11_i, q_s15_r);
        q_out6_r = vhaddq_s32 (q_s10_r, q_s14_i);
        q_out6_i = vhsubq_s32 (q_s10_i, q_s14_r);
        q_out7_r = vhaddq_s32 (q_s11_r, q_s15_i);
        q_out7_i = vhsubq_s32 (q_s11_i, q_s15_r);
        RADIX8x4_STORE
    }
}
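/* Background sketch (added): the "scaled" kernels divide by the transform
 * length as they go so 32-bit intermediates cannot overflow; a radix-8 stage
 * therefore halves three times (vhaddq/vhsubq once per butterfly level).
 * One vhaddq_s32 lane behaves like: */
static inline ne10_int32_t halving_add (ne10_int32_t a, ne10_int32_t b)
{
    /* the 64-bit sum avoids the intermediate overflow of (a + b) >> 1 */
    return (ne10_int32_t) (((ne10_int64_t) a + b) >> 1);
}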
#define RADIX4x4_WITHOUT_TW_START \
        ne10_int32_t f_count; \
        ne10_int32_t src_step = stride << 1; \
        int32_t *p_src, *p_dst; \
        int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
        int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
        int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
        int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
        int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
        p_src = (int32_t *) Fin; \
        p_dst = (int32_t *) Fout;

/* The p_src advances between loads are restored (lost in extraction). */
#define RADIX4x4_WITHOUT_TW_LOAD \
        q2_in0 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in1 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in2 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in3 = vld2q_s32 (p_src); \
        p_src += src_step;

/* The p_dst advances between stores are restored (lost in extraction). */
#define RADIX4x4_WITHOUT_TW_STORE \
        q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
        q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
        q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
        q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
        q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
        q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
        q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
        q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
        q2_out2.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
        q2_out2.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
        q2_out3.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
        q2_out3.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
        vst2q_s32 (p_dst, q2_out0); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out1); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out2); \
        p_dst += 8; \
        vst2q_s32 (p_dst, q2_out3); \
        p_dst += 8; \
        p_src = p_src - src_step * 4 + 8;

#define RADIX4x4_WITHOUT_TW_S0 \
        q_s0_r = vaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s0_i = vaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s1_r = vsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s1_i = vsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s2_r = vaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s2_i = vaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_s3_r = vsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s3_i = vsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_out2_r = vsubq_s32 (q_s0_r, q_s2_r); \
        q_out2_i = vsubq_s32 (q_s0_i, q_s2_i); \
        q_out0_r = vaddq_s32 (q_s0_r, q_s2_r); \
        q_out0_i = vaddq_s32 (q_s0_i, q_s2_i);

#define RADIX4x4_WITHOUT_TW_S0_SCALED \
        q_s0_r = vhaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s0_i = vhaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s1_r = vhsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
        q_s1_i = vhsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
        q_s2_r = vhaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s2_i = vhaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_s3_r = vhsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
        q_s3_i = vhsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
        q_out2_r = vhsubq_s32 (q_s0_r, q_s2_r); \
        q_out2_i = vhsubq_s32 (q_s0_i, q_s2_i); \
        q_out0_r = vhaddq_s32 (q_s0_r, q_s2_r); \
        q_out0_i = vhaddq_s32 (q_s0_i, q_s2_i);

static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0
        q_out1_r = vaddq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vsubq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vsubq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vaddq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0
        q_out1_r = vsubq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vaddq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vaddq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vsubq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED
        q_out1_r = vhaddq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vhsubq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vhsubq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vhaddq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD
        RADIX4x4_WITHOUT_TW_S0_SCALED
        q_out1_r = vhsubq_s32 (q_s1_r, q_s3_i);
        q_out1_i = vhaddq_s32 (q_s1_i, q_s3_r);
        q_out3_r = vhaddq_s32 (q_s1_r, q_s3_i);
        q_out3_i = vhsubq_s32 (q_s1_i, q_s3_r);
        RADIX4x4_WITHOUT_TW_STORE
    }
}
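/* Background sketch (added): the "4x4" kernels process four sub-FFTs per
 * instruction by de-interleaving complex data with vld2q_s32: val[0]
 * receives four real parts and val[1] four imaginary parts. */
static inline int32x4x2_t load_four_complex (const ne10_fft_cpx_int32_t * p)
{
    /* {r0,i0,r1,i1,r2,i2,r3,i3} -> val[0] = {r0,r1,r2,r3},
       val[1] = {i0,i1,i2,i3} */
    return vld2q_s32 ((const int32_t *) p);
}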
#define RADIX4x4_WITH_TW_START \
        ne10_int32_t m_count; \
        ne10_int32_t src_step = src_stride << 1; \
        ne10_int32_t dst_step = dst_stride << 1; \
        ne10_int32_t tw_step = mstride << 1; \
        int32_t *p_src, *p_dst, *p_tw; \
        int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
        int32x4x2_t q2_tw0, q2_tw1, q2_tw2; \
        int32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
        int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
        int32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i; \
        int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
        p_src = (int32_t *) Fin; \
        p_dst = (int32_t *) Fout; \
        p_tw = (int32_t *) tw;

/* The p_tw advances between twiddle loads are restored (lost in
 * extraction). */
#define RADIX4x4_WITH_TW_LOAD \
        q2_in0 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in1 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in2 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_in3 = vld2q_s32 (p_src); \
        p_src += src_step; \
        q2_tw0 = vld2q_s32 (p_tw); \
        p_tw += tw_step; \
        q2_tw1 = vld2q_s32 (p_tw); \
        p_tw += tw_step; \
        q2_tw2 = vld2q_s32 (p_tw); \
        q_s1_r = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[0]); \
        q_s1_i = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[0]); \
        q_s2_r = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[0]); \
        q_s2_i = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[0]); \
        q_s3_r = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[0]); \
        q_s3_i = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[0]); \
        q_tmp0 = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[1]); \
        q_tmp1 = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[1]); \
        q_tmp2 = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[1]); \
        q_tmp3 = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[1]); \
        q_tmp4 = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[1]); \
        q_tmp5 = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[1]);

#define RADIX4x4_WITH_TW_STORE \
        vst2q_s32 (p_dst, q2_out0); \
        p_dst += dst_step; \
        vst2q_s32 (p_dst, q2_out1); \
        p_dst += dst_step; \
        vst2q_s32 (p_dst, q2_out2); \
        p_dst += dst_step; \
        vst2q_s32 (p_dst, q2_out3); \
        p_dst += dst_step; \
        p_src = p_src - src_step * 4 + 8; \
        p_dst = p_dst - dst_step * 4 + 8; \
        p_tw = p_tw - tw_step * 2 + 8;

#define RADIX4x4_WITH_TW_S1_FWD \
        q_s1_r = vsubq_s32 (q_s1_r, q_tmp0); \
        q_s1_i = vaddq_s32 (q_s1_i, q_tmp1); \
        q_s2_r = vsubq_s32 (q_s2_r, q_tmp2); \
        q_s2_i = vaddq_s32 (q_s2_i, q_tmp3); \
        q_s3_r = vsubq_s32 (q_s3_r, q_tmp4); \
        q_s3_i = vaddq_s32 (q_s3_i, q_tmp5);

#define RADIX4x4_WITH_TW_S1_INV \
        q_s1_r = vaddq_s32 (q_s1_r, q_tmp0); \
        q_s1_i = vsubq_s32 (q_s1_i, q_tmp1); \
        q_s2_r = vaddq_s32 (q_s2_r, q_tmp2); \
        q_s2_i = vsubq_s32 (q_s2_i, q_tmp3); \
        q_s3_r = vaddq_s32 (q_s3_r, q_tmp4); \
        q_s3_i = vsubq_s32 (q_s3_i, q_tmp5);

#define RADIX4x4_WITH_TW_LS_02 \
        q_s4_r = vaddq_s32 (q2_in0.val[0], q_s2_r); \
        q_s4_i = vaddq_s32 (q2_in0.val[1], q_s2_i); \
        q_s5_r = vsubq_s32 (q2_in0.val[0], q_s2_r); \
        q_s5_i = vsubq_s32 (q2_in0.val[1], q_s2_i); \
        q_s6_r = vaddq_s32 (q_s1_r, q_s3_r); \
        q_s6_i = vaddq_s32 (q_s1_i, q_s3_i); \
        q_s7_r = vsubq_s32 (q_s1_r, q_s3_r); \
        q_s7_i = vsubq_s32 (q_s1_i, q_s3_i); \
        q2_out2.val[0] = vsubq_s32 (q_s4_r, q_s6_r); \
        q2_out2.val[1] = vsubq_s32 (q_s4_i, q_s6_i); \
        q2_out0.val[0] = vaddq_s32 (q_s4_r, q_s6_r); \
        q2_out0.val[1] = vaddq_s32 (q_s4_i, q_s6_i);

#define RADIX4x4_WITH_TW_LS_02_SCALED \
        q_s4_r = vhaddq_s32 (q2_in0.val[0], q_s2_r); \
        q_s4_i = vhaddq_s32 (q2_in0.val[1], q_s2_i); \
        q_s5_r = vhsubq_s32 (q2_in0.val[0], q_s2_r); \
        q_s5_i = vhsubq_s32 (q2_in0.val[1], q_s2_i); \
        q_s6_r = vhaddq_s32 (q_s1_r, q_s3_r); \
        q_s6_i = vhaddq_s32 (q_s1_i, q_s3_i); \
        q_s7_r = vhsubq_s32 (q_s1_r, q_s3_r); \
        q_s7_i = vhsubq_s32 (q_s1_i, q_s3_i); \
        q2_out2.val[0] = vhsubq_s32 (q_s4_r, q_s6_r); \
        q2_out2.val[1] = vhsubq_s32 (q_s4_i, q_s6_i); \
        q2_out0.val[0] = vhaddq_s32 (q_s4_r, q_s6_r); \
        q2_out0.val[1] = vhaddq_s32 (q_s4_i, q_s6_i);

static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02
        q2_out1.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02
        q2_out1.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD
        RADIX4x4_WITH_TW_LS_02_SCALED
        q2_out1.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_fft_cpx_int32_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV
        RADIX4x4_WITH_TW_LS_02_SCALED
        q2_out1.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
        q2_out1.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
        q2_out3.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
        q2_out3.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
        RADIX4x4_WITH_TW_STORE
    }
}
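/* Background sketch (added): the complex twiddle multiply that
 * RADIX4x4_WITH_TW_LOAD plus RADIX4x4_WITH_TW_S1_FWD implement per lane,
 * written with the scalar q31_mul helper from above. */
static inline ne10_fft_cpx_int32_t q31_cmul (ne10_fft_cpx_int32_t a,
        ne10_fft_cpx_int32_t tw)
{
    ne10_fft_cpx_int32_t out;
    /* forward kernels compute a * tw; the _S1_INV variant flips the two
       signs below, i.e. multiplies by conj(tw) */
    out.r = q31_mul (a.r, tw.r) - q31_mul (a.i, tw.i);
    out.i = q31_mul (a.i, tw.r) + q31_mul (a.r, tw.i);
    return out;
}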
#define ne10_mixed_radix_fft_forward_int32_neon(scaled) \
void ne10_mixed_radix_fft_forward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
        ne10_fft_cpx_int32_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int32_t * twiddles, \
        ne10_fft_cpx_int32_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int32_t *Fin1, *Fout1; \
    ne10_fft_cpx_int32_t *Fout_ls = Fout; \
    ne10_fft_cpx_int32_t *Ftmp; \
    ne10_fft_cpx_int32_t *tw, *tw1; \
    /* read the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    /* first stage: one of the two twiddle-free butterflies runs here; */ \
    /* the radix-8/radix-4 dispatch was elided in extraction */ \
    ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    /* middle stages */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            /* ... */ \
            ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        } \
        tw += mstride * 3; \
        /* ... */ \
    } \
    /* last stage */ \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        /* ... */ \
        ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
    } \
    /* ... */ \
}

#define ne10_mixed_radix_fft_backward_int32_neon(scaled) \
void ne10_mixed_radix_fft_backward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
        ne10_fft_cpx_int32_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int32_t * twiddles, \
        ne10_fft_cpx_int32_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int32_t *Fin1, *Fout1; \
    ne10_fft_cpx_int32_t *Fout_ls = Fout; \
    ne10_fft_cpx_int32_t *Ftmp; \
    ne10_fft_cpx_int32_t *tw, *tw1; \
    /* read the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    /* first stage (dispatch elided in extraction, as above) */ \
    ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
    /* ... */ \
    /* middle stages */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            /* ... */ \
            ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
        } \
        tw += mstride * 3; \
        /* ... */ \
    } \
    /* last stage */ \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        /* ... */ \
        ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
    } \
    /* ... */ \
}

/* Instantiations (restored; implied by the _unscaled/_scaled entry points
   referenced by the C2C dispatcher below). */
ne10_mixed_radix_fft_forward_int32_neon (unscaled)
ne10_mixed_radix_fft_forward_int32_neon (scaled)
ne10_mixed_radix_fft_backward_int32_neon (unscaled)
ne10_mixed_radix_fft_backward_int32_neon (scaled)

/* Signature and scalar declarations partially restored; the original header
   was elided in extraction. */
static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
        ne10_fft_cpx_int32_t *src,
        ne10_fft_cpx_int32_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k, count;
    ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
    int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fpnk_r, q_fpnk_i;
    int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int32x4_t q_tw_r, q_tw_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;
    /* ... (scaling of tdc elided) */
    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
    if (scaled_flag)
    {
        /* ... */
        for (k = 1; k <= count ; k += 4)
        {
            p_src = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk = vld2q_s32 (p_src);
            q2_fpnk = vld2q_s32 (p_src2);

            q2_tw = vld2q_s32 (p_twiddles);
            /* reverse the four fpnk lanes: vrev64q swaps within each 64-bit
               pair, vcombine then swaps the halves, a full 4-lane reversal */
            q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_s32 (q_fpnk_i);

            q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

            q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
            q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
            q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
            q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        /* ... */
        for (k = 1; k <= count ; k += 4)
        {
            p_src = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk = vld2q_s32 (p_src);
            q2_fpnk = vld2q_s32 (p_src2);

            q2_tw = vld2q_s32 (p_twiddles);
            q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_s32 (q_fpnk_i);

            q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

            q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
            q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
            q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
            q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    /* ... (scalar path; the branch that selects it was elided in
       extraction) */
    for (k = 1; k <= ncfft / 2 ; ++k)
    {
        fpk = src[k];
        fpnk.r = src[ncfft - k].r;
        fpnk.i = - src[ncfft - k].i;
        /* ... (scaling of fpk/fpnk elided) */

        f1k.r = fpk.r + fpnk.r;
        f1k.i = fpk.i + fpnk.i;

        f2k.r = fpk.r - fpnk.r;
        f2k.i = fpk.i - fpnk.i;

        /* ... (tw = f2k * twiddles[k - 1] in Q31; elided) */

        dst[k].r = (f1k.r + tw.r) >> 1;
        dst[k].i = (f1k.i + tw.i) >> 1;
        dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
        dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
    }
}
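/* Note (added): the split step converts the ncfft-point complex FFT of the
 * packed real input into the 2*ncfft-point real spectrum. For each bin k,
 * with F the half-length FFT:
 *     f1k = F[k] + conj(F[ncfft - k])      (even part)
 *     f2k = F[k] - conj(F[ncfft - k])      (odd part)
 *     dst[k] = (f1k + tw[k - 1] * f2k) / 2
 * and dst[ncfft - k] is the conjugate combination, which is exactly the
 * arithmetic of both the NEON and scalar paths above. */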
/* Signature and scalar declarations partially restored; the original header
   was elided in extraction. */
static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
        ne10_fft_cpx_int32_t *src,
        ne10_fft_cpx_int32_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k, count;
    ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
    int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fnkc_r, q_fnkc_i;
    int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    /* bin 0 and the Nyquist bin are both real; the forward split packed them
       into src[0].r and src[ncfft].r */
    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;
    /* ... (scaling elided) */
    if (scaled_flag)
    {
        /* ... */
        for (k = 1; k <= count ; k += 4)
        {
            p_src = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fk = vld2q_s32 (p_src);
            q2_fnkc = vld2q_s32 (p_src2);
            q2_tw = vld2q_s32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_s32 (q_fnkc_i);

            q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
            q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

            q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        /* ... */
        for (k = 1; k <= count ; k += 4)
        {
            p_src = (int32_t*) (& (src[k]));
            p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (int32_t*) (& (twiddles[k - 1]));
            p_dst = (int32_t*) (& (dst[k]));
            p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

            q2_fk = vld2q_s32 (p_src);
            q2_fnkc = vld2q_s32 (p_src2);
            q2_tw = vld2q_s32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_s32 (q_fnkc_i);

            q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
            q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

            q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
            q_dst2_r = vrev64q_s32 (q_dst2_r);
            q_dst2_i = vrev64q_s32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
            vst2q_s32 (p_dst, q2_dst);
            vst2q_s32 (p_dst2, q2_dst2);
        }
    }
    /* ... (scalar path; the branch that selects it was elided in
       extraction) */
    for (k = 1; k <= ncfft / 2; k++)
    {
        fk = src[k];
        fnkc.r = src[ncfft - k].r;
        fnkc.i = -src[ncfft - k].i;
        /* ... (scaling of fk/fnkc elided) */

        fek.r = fk.r + fnkc.r;
        fek.i = fk.i + fnkc.i;

        tmp.r = fk.r - fnkc.r;
        tmp.i = fk.i - fnkc.i;

        /* ... (fok = tmp * conj (twiddles[k - 1]) in Q31; elided) */

        dst[k].r = fek.r + fok.r;
        dst[k].i = fek.i + fok.i;

        dst[ncfft - k].r = fek.r - fok.r;
        dst[ncfft - k].i = fok.i - fek.i;
    }
}
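/* A minimal sketch (added; the original lines were elided in extraction) of
 * the fok term used by the scalar c2r loop above. It mirrors the NEON
 * sequence for q_fok_r/q_fok_i: fok = tmp * conj (twiddles[k - 1]) in Q31,
 * using the scalar q31_mul helper from above. */
static inline ne10_fft_cpx_int32_t q31_cmul_conj (ne10_fft_cpx_int32_t a,
        ne10_fft_cpx_int32_t tw)
{
    ne10_fft_cpx_int32_t out;
    out.r = q31_mul (a.r, tw.r) + q31_mul (a.i, tw.i);
    out.i = q31_mul (a.i, tw.r) - q31_mul (a.r, tw.i);
    return out;
}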
/* C2C dispatcher. The function frame and switch scaffolding were elided in
   extraction and are reconstructed around the original call lines; the
   default (mixed-radix) arguments follow the driver macros above. */
void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
        ne10_fft_cpx_int32_t *fin,
        ne10_fft_cfg_int32_t cfg,
        ne10_int32_t inverse_fft,
        ne10_int32_t scaled_flag)
{
    if (scaled_flag)
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
    else
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
}
/* R2C wrapper: run a half-length C2C transform, then split it into the real
   spectrum. The frame is partially restored; the buffer setup and the C2C
   call were elided in extraction. */
void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
        ne10_int32_t *fin,
        ne10_fft_r2c_cfg_int32_t cfg,
        ne10_int32_t scaled_flag)
{
    /* ... (tmpbuf1/tmpbuf2 setup and remaining c2c_state fields elided) */
    c2c_state.buffer = tmpbuf2;
    /* ... (forward C2C transform of the packed real input into tmpbuf1) */
    ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}
/* C2R wrapper: merge the real spectrum, then run an inverse half-length C2C
   transform. The frame is partially restored; setup and the inverse C2C
   call were elided in extraction. */
void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
        ne10_fft_cpx_int32_t *fin,
        ne10_fft_r2c_cfg_int32_t cfg,
        ne10_int32_t scaled_flag)
{
    /* ... (tmpbuf1/tmpbuf2 setup and remaining c2c_state fields elided) */
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    /* ... (inverse C2C transform of tmpbuf1 into fout follows) */
}
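/* Usage sketch (added, not from the original file). The allocation helpers
 * ne10_fft_alloc_c2c_int32 / ne10_fft_destroy_c2c_int32 are assumed from the
 * Ne10 public API rather than taken from this file. */
static void example_c2c_forward (ne10_fft_cpx_int32_t *out,
        ne10_fft_cpx_int32_t *in,
        ne10_int32_t nfft)
{
    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32 (nfft); /* assumed helper */
    /* scaled_flag = 1 halves at every stage to keep values in 32-bit range */
    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0, 1);
    ne10_fft_destroy_c2c_int32 (cfg); /* assumed helper */
}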