42 Fout[0].r = Fin[0].r + Fin[1].r;
43 Fout[0].i = Fin[0].i + Fin[1].i;
44 Fout[1].r = Fin[0].r - Fin[1].r;
45 Fout[1].i = Fin[0].i - Fin[1].i;

52 Fout[0].r = Fin[0].r + Fin[1].r;
53 Fout[0].i = Fin[0].i + Fin[1].i;
54 Fout[1].r = Fin[0].r - Fin[1].r;
55 Fout[1].i = Fin[0].i - Fin[1].i;
62 Fout[0].r = (Fin[0].r + Fin[1].r) >> 1;
63 Fout[0].i = (Fin[0].i + Fin[1].i) >> 1;
64 Fout[1].r = (Fin[0].r - Fin[1].r) >> 1;
65 Fout[1].i = (Fin[0].i - Fin[1].i) >> 1;

72 Fout[0].r = (Fin[0].r + Fin[1].r) >> 1;
73 Fout[0].i = (Fin[0].i + Fin[1].i) >> 1;
74 Fout[1].r = (Fin[0].r - Fin[1].r) >> 1;
75 Fout[1].i = (Fin[0].i - Fin[1].i) >> 1;
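The scaled radix-2 pairs above fold a divide-by-two into each butterfly; the FFT4 and FFT8 macros that follow do the same with >> 2 and >> 3, which keeps every stage inside the int16 range at the cost of output precision. Those macros also multiply by the Q15 twiddle constant TW_81 = 23169 (cos(pi/4) ~ 0.70711 * 2^15) using the pattern (x * TW_81) >> NE10_F2I16_SHIFT. A minimal sketch of that fixed-point multiply, assuming the usual Ne10 convention that NE10_F2I16_SHIFT is 15 and NE10_F2I16_SAMPPROD is a 32-bit product type (the helper name q15_mul is hypothetical, for illustration only):

#include <stdint.h>

/* Q15 multiply: widen to 32 bits, take the product, shift back down.    */
/* q15_mul is a hypothetical helper; the listing writes this out inline. */
static inline int16_t q15_mul (int16_t a, int16_t b)
{
    return (int16_t) (((int32_t) a * b) >> 15);
}

/* Example: q15_mul (16384, 23169) == 11584, i.e. 0.5 * 0.7071 ~ 0.3536 in Q15. */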
78 #define FFT4_FS_START \
79 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
80 ne10_int16_t tmp_r, tmp_i;

83 s2_r = Fin[0].r - Fin[2].r; \
84 s2_i = Fin[0].i - Fin[2].i; \
85 tmp_r = Fin[0].r + Fin[2].r; \
86 tmp_i = Fin[0].i + Fin[2].i; \
87 s0_r = Fin[1].r + Fin[3].r; \
88 s0_i = Fin[1].i + Fin[3].i; \
89 s1_r = Fin[1].r - Fin[3].r; \
90 s1_i = Fin[1].i - Fin[3].i;

92 #define FFT4_FS_SCALED \
93 s2_r = (Fin[0].r - Fin[2].r) >> 2; \
94 s2_i = (Fin[0].i - Fin[2].i) >> 2; \
95 tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
96 tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
97 s0_r = (Fin[1].r + Fin[3].r) >> 2; \
98 s0_i = (Fin[1].i + Fin[3].i) >> 2; \
99 s1_r = (Fin[1].r - Fin[3].r) >> 2; \
100 s1_i = (Fin[1].i - Fin[3].i) >> 2;

102 #define FFT4_FWD_LS \
103 Fout[2].r = tmp_r - s0_r; \
104 Fout[2].i = tmp_i - s0_i; \
105 Fout[0].r = tmp_r + s0_r; \
106 Fout[0].i = tmp_i + s0_i; \
107 Fout[1].r = s2_r + s1_i; \
108 Fout[1].i = s2_i - s1_r; \
109 Fout[3].r = s2_r - s1_i; \
110 Fout[3].i = s2_i + s1_r;

112 #define FFT4_INV_LS \
113 Fout[2].r = tmp_r - s0_r; \
114 Fout[2].i = tmp_i - s0_i; \
115 Fout[0].r = tmp_r + s0_r; \
116 Fout[0].i = tmp_i + s0_i; \
117 Fout[1].r = s2_r - s1_i; \
118 Fout[1].i = s2_i + s1_r; \
119 Fout[3].r = s2_r + s1_i; \
120 Fout[3].i = s2_i - s1_r;

157 #define FFT8_FS_START \
158 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
159 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
160 const ne10_int16_t TW_81 = 23169;

163 s0_r = Fin[0].r + Fin[4].r; \
164 s0_i = Fin[0].i + Fin[4].i; \
165 s1_r = Fin[0].r - Fin[4].r; \
166 s1_i = Fin[0].i - Fin[4].i; \
167 s2_r = Fin[1].r + Fin[5].r; \
168 s2_i = Fin[1].i + Fin[5].i; \
169 s3_r = Fin[1].r - Fin[5].r; \
170 s3_i = Fin[1].i - Fin[5].i; \
171 s4_r = Fin[2].r + Fin[6].r; \
172 s4_i = Fin[2].i + Fin[6].i; \
173 s5_r = Fin[2].r - Fin[6].r; \
174 s5_i = Fin[2].i - Fin[6].i; \
175 s6_r = Fin[3].r + Fin[7].r; \
176 s6_i = Fin[3].i + Fin[7].i; \
177 s7_r = Fin[3].r - Fin[7].r; \
178 s7_i = Fin[3].i - Fin[7].i;

180 #define FFT8_FS_SCALED \
181 s0_r = (Fin[0].r + Fin[4].r) >> 3; \
182 s0_i = (Fin[0].i + Fin[4].i) >> 3; \
183 s1_r = (Fin[0].r - Fin[4].r) >> 3; \
184 s1_i = (Fin[0].i - Fin[4].i) >> 3; \
185 s2_r = (Fin[1].r + Fin[5].r) >> 3; \
186 s2_i = (Fin[1].i + Fin[5].i) >> 3; \
187 s3_r = (Fin[1].r - Fin[5].r) >> 3; \
188 s3_i = (Fin[1].i - Fin[5].i) >> 3; \
189 s4_r = (Fin[2].r + Fin[6].r) >> 3; \
190 s4_i = (Fin[2].i + Fin[6].i) >> 3; \
191 s5_r = (Fin[2].r - Fin[6].r) >> 3; \
192 s5_i = (Fin[2].i - Fin[6].i) >> 3; \
193 s6_r = (Fin[3].r + Fin[7].r) >> 3; \
194 s6_i = (Fin[3].i + Fin[7].i) >> 3; \
195 s7_r = (Fin[3].r - Fin[7].r) >> 3; \
196 s7_i = (Fin[3].i - Fin[7].i) >> 3;

199 #define FFT8_FWD_LS \
200 t0_r = s0_r - s4_r; \
201 t0_i = s0_i - s4_i; \
202 t1_r = s0_r + s4_r; \
203 t1_i = s0_i + s4_i; \
204 t2_r = s2_r + s6_r; \
205 t2_i = s2_i + s6_i; \
206 t3_r = s2_r - s6_r; \
207 t3_i = s2_i - s6_i; \
208 Fout[0].r = t1_r + t2_r; \
209 Fout[0].i = t1_i + t2_i; \
210 Fout[4].r = t1_r - t2_r; \
211 Fout[4].i = t1_i - t2_i; \
212 Fout[2].r = t0_r + t3_i; \
213 Fout[2].i = t0_i - t3_r; \
214 Fout[6].r = t0_r - t3_i; \
215 Fout[6].i = t0_i + t3_r; \
216 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
217 t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
218 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
219 t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
220 t0_r = s1_r - s5_i; \
221 t0_i = s1_i + s5_r; \
222 t1_r = s1_r + s5_i; \
223 t1_i = s1_i - s5_r; \
224 t2_r = t4_r - t5_r; \
225 t2_i = t4_i - t5_i; \
226 t3_r = t4_r + t5_r; \
227 t3_i = t4_i + t5_i; \
228 Fout[1].r = t1_r + t2_r; \
229 Fout[1].i = t1_i + t2_i; \
230 Fout[5].r = t1_r - t2_r; \
231 Fout[5].i = t1_i - t2_i; \
232 Fout[3].r = t0_r + t3_i; \
233 Fout[3].i = t0_i - t3_r; \
234 Fout[7].r = t0_r - t3_i; \
235 Fout[7].i = t0_i + t3_r;

237 #define FFT8_INV_LS \
238 t0_r = s0_r - s4_r; \
239 t0_i = s0_i - s4_i; \
240 t1_r = s0_r + s4_r; \
241 t1_i = s0_i + s4_i; \
242 t2_r = s2_r + s6_r; \
243 t2_i = s2_i + s6_i; \
244 t3_r = s2_r - s6_r; \
245 t3_i = s2_i - s6_i; \
246 Fout[0].r = t1_r + t2_r; \
247 Fout[0].i = t1_i + t2_i; \
248 Fout[4].r = t1_r - t2_r; \
249 Fout[4].i = t1_i - t2_i; \
250 Fout[2].r = t0_r - t3_i; \
251 Fout[2].i = t0_i + t3_r; \
252 Fout[6].r = t0_r + t3_i; \
253 Fout[6].i = t0_i - t3_r; \
254 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
255 t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
256 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
257 t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
258 t0_r = s1_r + s5_i; \
259 t0_i = s1_i - s5_r; \
260 t1_r = s1_r - s5_i; \
261 t1_i = s1_i + s5_r; \
262 t2_r = t4_r - t5_r; \
263 t2_i = t4_i - t5_i; \
264 t3_r = t4_r + t5_r; \
265 t3_i = t4_i + t5_i; \
266 Fout[1].r = t1_r + t2_r; \
267 Fout[1].i = t1_i + t2_i; \
268 Fout[5].r = t1_r - t2_r; \
269 Fout[5].i = t1_i - t2_i; \
270 Fout[3].r = t0_r - t3_i; \
271 Fout[3].i = t0_i + t3_r; \
272 Fout[7].r = t0_r + t3_i; \
273 Fout[7].i = t0_i - t3_r;

310 #define RADIX8x4_START \
311 ne10_int32_t f_count; \
312 ne10_int32_t src_step = stride << 1; \
313 const ne10_int16_t TW_81 = 23169; \
314 const ne10_int16_t TW_81N = -23169; \
315 int16_t *p_src, *p_dst; \
316 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
317 int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
318 int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
319 int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
320 int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
321 int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
322 int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
323 int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
324 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
325 int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
326 int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
327 int16x4_t d_tw_81, d_tw_81n; \
328 p_src = (int16_t *) Fin; \
329 p_dst = (int16_t *) Fout;

332 #define RADIX8x4_LOAD \
333 d2_in0 = vld2_s16 (p_src); \
335 d2_in2 = vld2_s16 (p_src); \
337 d2_in4 = vld2_s16 (p_src); \
339 d2_in6 = vld2_s16 (p_src); \
341 d2_in1 = vld2_s16 (p_src); \
343 d2_in3 = vld2_s16 (p_src); \
345 d2_in5 = vld2_s16 (p_src); \
347 d2_in7 = vld2_s16 (p_src); \

350 #define RADIX8x4_STORE \
351 q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
352 q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
353 q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
354 q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
355 q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
356 q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
357 q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
358 q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
359 d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
360 d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
361 d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
362 d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
363 d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
364 d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
365 d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
366 d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
367 d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
368 d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
369 d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
370 d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
371 d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
372 d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
373 d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
374 d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
375 vst2_s16 (p_dst, d2_out0); \
377 vst2_s16 (p_dst, d2_out1); \
379 vst2_s16 (p_dst, d2_out2); \
381 vst2_s16 (p_dst, d2_out3); \
383 vst2_s16 (p_dst, d2_out4); \
385 vst2_s16 (p_dst, d2_out5); \
387 vst2_s16 (p_dst, d2_out6); \
389 vst2_s16 (p_dst, d2_out7); \
391 p_src = p_src - src_step * 8 + 8;

393 #define RADIX8x4_FS_S0 \
394 d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
395 d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
396 d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
397 d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
398 d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
399 d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
400 d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
401 d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
402 d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
403 d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
404 d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
405 d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
406 d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
407 d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
408 d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
409 d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);

411 #define RADIX8x4_FWD_S357 \
412 d_tw_81 = vdup_n_s16 (TW_81); \
413 d_tw_81n = vdup_n_s16 (TW_81N); \
415 d_s5_i = vneg_s16 (d_sin5_r); \
416 d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
417 d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
418 d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
419 d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
420 d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
421 d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
422 d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
423 d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);

425 #define RADIX8x4_INV_S357 \
426 d_tw_81 = vdup_n_s16 (TW_81); \
427 d_tw_81n = vdup_n_s16 (TW_81N); \
428 d_s5_r = vneg_s16 (d_sin5_i); \
430 d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
431 d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
432 d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
433 d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
434 d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
435 d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
436 d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
437 d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);

439 #define RADIX8x4_LS_02 \
440 d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
441 d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
442 d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
443 d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
444 d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
445 d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
446 d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
447 d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
448 d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
449 d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
450 d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
451 d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
452 d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
453 d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
454 d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
455 d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
456 d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
457 d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
458 d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
459 d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
460 d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
461 d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
462 d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
463 d_out1_i = vadd_s16 (d_s9_i, d_s13_i);

465 #define RADIX8x4_FS_S0_SCALED \
466 d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
467 d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
468 d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
469 d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
470 d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
471 d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
472 d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
473 d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
474 d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
475 d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
476 d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
477 d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
478 d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
479 d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
480 d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
481 d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);

483 #define RADIX8x4_LS_02_SCALED \
484 d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
485 d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
486 d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
487 d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
488 d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
489 d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
490 d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
491 d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
492 d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
493 d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
494 d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
495 d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
496 d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
497 d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
498 d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
499 d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
500 d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
501 d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
502 d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
503 d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
504 d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
505 d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
506 d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
507 d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);

516 for (f_count = 0; f_count < stride; f_count += 4)
526 d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
527 d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
528 d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
529 d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
530 d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
531 d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
532 d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
533 d_out7_i = vadd_s16 (d_s11_i, d_s15_r);
545 for (f_count = 0; f_count < stride; f_count += 4)
554 d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
555 d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
556 d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
557 d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
558 d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
559 d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
560 d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
561 d_out7_i = vsub_s16 (d_s11_i, d_s15_r);
572 for (f_count = 0; f_count < stride; f_count += 4)
581 d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
582 d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
583 d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
584 d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
585 d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
586 d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
587 d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
588 d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);
600 for (f_count = 0; f_count < stride; f_count += 4)
609 d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
610 d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
611 d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
612 d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
613 d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
614 d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
615 d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
616 d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);
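The four loop bodies above differ only in which cross terms are added or subtracted and in the substitution of halving intrinsics for plain ones. A short note on those halving intrinsics, which is what lets the *_SCALED variants absorb the per-stage divide:

/* Per 16-bit lane:
 *   vadd_s16 (a, b)  -> a + b           vsub_s16 (a, b)  -> a - b
 *   vhadd_s16 (a, b) -> (a + b) >> 1    vhsub_s16 (a, b) -> (a - b) >> 1
 * so each vhadd_s16/vhsub_s16 is the NEON equivalent of the "(x + y) >> 1"
 * scaling written out explicitly in the scalar butterflies earlier. */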
622 #define RADIX4x4_WITHOUT_TW_START \
623 ne10_int32_t f_count; \
624 ne10_int32_t src_step = stride << 1; \
625 int16_t *p_src, *p_dst; \
626 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
627 int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
628 int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
629 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
630 int16x8x2_t q2_tmp0, q2_tmp1; \
631 int32x4x2_t q2_tmp2, q2_tmp3; \
632 p_src = (int16_t *) Fin; \
633 p_dst = (int16_t *) Fout;

635 #define RADIX4x4_WITHOUT_TW_LOAD \
636 d2_in0 = vld2_s16 (p_src); \
638 d2_in1 = vld2_s16 (p_src); \
640 d2_in2 = vld2_s16 (p_src); \
642 d2_in3 = vld2_s16 (p_src); \

645 #define RADIX4x4_WITHOUT_TW_STORE \
646 q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
647 q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
648 q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
649 q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
650 d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
651 d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
652 d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
653 d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
654 d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
655 d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
656 d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
657 d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
658 vst2_s16 (p_dst, d2_out0); \
660 vst2_s16 (p_dst, d2_out1); \
662 vst2_s16 (p_dst, d2_out2); \
664 vst2_s16 (p_dst, d2_out3); \
666 p_src = p_src - src_step * 4 + 8;

668 #define RADIX4x4_WITHOUT_TW_S0 \
669 d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
670 d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
671 d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
672 d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
673 d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
674 d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
675 d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
676 d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
677 d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
678 d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
679 d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
680 d_out0_i = vadd_s16 (d_s0_i, d_s2_i);

682 #define RADIX4x4_WITHOUT_TW_S0_SCALED \
683 d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
684 d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
685 d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
686 d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
687 d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
688 d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
689 d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
690 d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
691 d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
692 d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
693 d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
694 d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);

697 static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
703 for (f_count = 0; f_count < stride; f_count += 4)
711 d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
712 d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
713 d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
714 d_out3_i = vadd_s16 (d_s1_i, d_s3_r);
720 static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
726 for (f_count = 0; f_count < stride; f_count += 4)
734 d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
735 d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
736 d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
737 d_out3_i = vsub_s16 (d_s1_i, d_s3_r);
743 static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
749 for (f_count = 0; f_count < stride; f_count += 4)
757 d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
758 d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
759 d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
760 d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);
766 static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
772 for (f_count = 0; f_count < stride; f_count += 4)
780 d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
781 d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
782 d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
783 d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);
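The forward and backward bodies above differ only in which imaginary cross terms are added or subtracted. The reason is the rotation by -j or +j applied to the odd branch of the radix-4 butterfly; a worked expansion, for reference:

/* For a complex value s = s_r + j*s_i:
 *   -j * s =  s_i - j*s_r   (forward:  out.r picks up +s_i, out.i picks up -s_r)
 *   +j * s = -s_i + j*s_r   (backward: out.r picks up -s_i, out.i picks up +s_r)
 * which is exactly the swap seen between d_out1/d_out3 in the forward and
 * backward loops above, and between FFT4_FWD_LS and FFT4_INV_LS earlier. */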
789 #define RADIX4x4_WITH_TW_START \
790 ne10_int32_t m_count; \
791 ne10_int32_t src_step = src_stride << 1; \
792 ne10_int32_t dst_step = dst_stride << 1; \
793 ne10_int32_t tw_step = mstride << 1; \
794 int16_t *p_src, *p_dst, *p_tw; \
795 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
796 int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
797 int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
798 int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
799 int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
800 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
801 p_src = (int16_t *) Fin; \
802 p_dst = (int16_t *) Fout; \
803 p_tw = (int16_t *) tw;

805 #define RADIX4x4_WITH_TW_LOAD \
806 d2_in0 = vld2_s16 (p_src); \
808 d2_in1 = vld2_s16 (p_src); \
810 d2_in2 = vld2_s16 (p_src); \
812 d2_in3 = vld2_s16 (p_src); \
814 d2_tw0 = vld2_s16 (p_tw); \
816 d2_tw1 = vld2_s16 (p_tw); \
818 d2_tw2 = vld2_s16 (p_tw); \
819 d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
820 d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
821 d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
822 d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
823 d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
824 d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
825 d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
826 d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
827 d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
828 d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
829 d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
830 d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);

832 #define RADIX4x4_WITH_TW_STORE \
833 vst2_s16 (p_dst, d2_out0); \
835 vst2_s16 (p_dst, d2_out1); \
837 vst2_s16 (p_dst, d2_out2); \
839 vst2_s16 (p_dst, d2_out3); \
841 p_src = p_src - src_step * 4 + 8; \
842 p_dst = p_dst - dst_step * 4 + 8; \
843 p_tw = p_tw - tw_step * 2 + 8;

845 #define RADIX4x4_WITH_TW_S1_FWD \
846 d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
847 d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
848 d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
849 d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
850 d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
851 d_s3_i = vadd_s16 (d_s3_i, d_tmp5);

853 #define RADIX4x4_WITH_TW_S1_INV \
854 d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
855 d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
856 d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
857 d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
858 d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
859 d_s3_i = vsub_s16 (d_s3_i, d_tmp5);

862 #define RADIX4x4_WITH_TW_LS_02 \
863 d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
864 d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
865 d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
866 d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
867 d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
868 d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
869 d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
870 d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
871 d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
872 d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
873 d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
874 d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);

876 #define RADIX4x4_WITH_TW_LS_02_SCALED \
877 d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
878 d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
879 d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
880 d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
881 d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
882 d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
883 d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
884 d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
885 d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
886 d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
887 d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
888 d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);

891 static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
900 for (m_count = 0; m_count < mstride; m_count += 4)
908 d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
909 d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
910 d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
911 d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);
919 static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
928 for (m_count = 0; m_count < mstride; m_count += 4)
936 d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
937 d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
938 d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
939 d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);
948 static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
957 for (m_count = 0; m_count < mstride; m_count += 4)
965 d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
966 d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
967 d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
968 d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
975 static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
984 for (m_count = 0; m_count < mstride; m_count += 4)
992 d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
993 d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
994 d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
995 d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
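Before the butterfly, each of the three non-trivial inputs of these with-twiddle stages is multiplied by its twiddle factor. That complex product is split across RADIX4x4_WITH_TW_LOAD (the four partial products per input) and RADIX4x4_WITH_TW_S1_FWD / RADIX4x4_WITH_TW_S1_INV (their recombination); a sketch of the underlying arithmetic:

/* vqdmulh_s16 (x, y) returns (2 * x * y) >> 16 per lane, i.e. the Q15 product
 * of x and y (with saturation). With a = input sample, w = twiddle:
 *   forward:  a * w       ->  r = a.r*w.r - a.i*w.i,  i = a.i*w.r + a.r*w.i
 *   backward: a * conj(w) ->  r = a.r*w.r + a.i*w.i,  i = a.i*w.r - a.r*w.i
 * which is what S1_FWD and S1_INV assemble from d_s*_r/d_s*_i and d_tmp0..d_tmp5. */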
1003 #define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
1004 void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1005 ne10_fft_cpx_int16_t * Fin, \
1006 ne10_int32_t * factors, \
1007 ne10_fft_cpx_int16_t * twiddles, \
1008 ne10_fft_cpx_int16_t * buffer) \
1010 ne10_int32_t fstride, mstride, N; \
1011 ne10_int32_t fstride1; \
1012 ne10_int32_t f_count; \
1013 ne10_int32_t stage_count; \
1015 ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1016 ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1017 ne10_fft_cpx_int16_t *Ftmp; \
1018 ne10_fft_cpx_int16_t *tw, *tw1; \
1021 stage_count = factors[0]; \
1022 fstride = factors[1]; \
1023 mstride = factors[ (stage_count << 1) - 1 ]; \
1024 N = factors[ stage_count << 1 ]; \
1033 ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride);\
1044 ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1055 for (; stage_count > 1 ; stage_count--) \
1058 for (f_count = 0; f_count < fstride; f_count ++) \
1060 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1062 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1065 tw += mstride * 3; \
1077 for (f_count = 0; f_count < fstride; f_count ++) \
1080 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \

1087 #define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
1088 void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1089 ne10_fft_cpx_int16_t * Fin, \
1090 ne10_int32_t * factors, \
1091 ne10_fft_cpx_int16_t * twiddles, \
1092 ne10_fft_cpx_int16_t * buffer) \
1094 ne10_int32_t fstride, mstride, N; \
1095 ne10_int32_t fstride1; \
1096 ne10_int32_t f_count; \
1097 ne10_int32_t stage_count; \
1099 ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1100 ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1101 ne10_fft_cpx_int16_t *Ftmp; \
1102 ne10_fft_cpx_int16_t *tw, *tw1; \
1105 stage_count = factors[0]; \
1106 fstride = factors[1]; \
1107 mstride = factors[ (stage_count << 1) - 1 ]; \
1108 N = factors[ stage_count << 1 ]; \
1117 ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride);\
1128 ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1139 for (; stage_count > 1 ; stage_count--) \
1142 for (f_count = 0; f_count < fstride; f_count ++) \
1144 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1146 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1149 tw += mstride * 3; \
1161 for (f_count = 0; f_count < fstride; f_count ++) \
1164 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \

1187 int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1188 int16x8_t q_fpnk_r, q_fpnk_i;
1189 int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1190 int16x8_t q_tw_r, q_tw_i;
1191 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1192 int16x8_t q_dst2_r, q_dst2_i;
1193 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1201 dst[0].r = tdc.r + tdc.i;
1202 dst[ncfft].r = tdc.r - tdc.i;
1203 dst[ncfft].i = dst[0].i = 0;
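The two statements above handle the DC and Nyquist bins of the real FFT separately, since their twiddle is unity and their outputs are purely real. With tdc holding bin 0 of the half-length complex FFT of the packed input (even samples in .r, odd samples in .i, optionally pre-scaled), this is simply:

/* dst[0].r     = Re{tdc} + Im{tdc}   (sum of all input samples: the DC term)
 * dst[ncfft].r = Re{tdc} - Im{tdc}   (the Nyquist bin)
 * dst[0].i     = dst[ncfft].i = 0    (both bins are purely real for real input) */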
1209 for (k = 1; k <= count ; k += 8)
1211 p_src = (int16_t*) (& (src[k]));
1212 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1213 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1214 p_dst = (int16_t*) (& (dst[k]));
1215 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1217 q2_fpk = vld2q_s16 (p_src);
1218 q2_fpnk = vld2q_s16 (p_src2);
1220 q2_tw = vld2q_s16 (p_twiddles);
1221 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1222 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1223 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1224 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1225 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1226 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1227 q_fpnk_i = vnegq_s16 (q_fpnk_i);
1229 q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1230 q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1232 q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1233 q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1235 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1236 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1237 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1238 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1239 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1240 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1242 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1243 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1244 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1245 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1246 q_dst2_r = vrev32q_s16 (q_dst2_r);
1247 q_dst2_i = vrev32q_s16 (q_dst2_i);
1248 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1249 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1250 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1251 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1252 vst2q_s16 (p_dst, q2_dst);
1253 vst2q_s16 (p_dst2, q2_dst2);
1259 for (k = 1; k <= count ; k += 8)
1261 p_src = (int16_t*) (& (src[k]));
1262 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1263 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1264 p_dst = (int16_t*) (& (dst[k]));
1265 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1267 q2_fpk = vld2q_s16 (p_src);
1268 q2_fpnk = vld2q_s16 (p_src2);
1270 q2_tw = vld2q_s16 (p_twiddles);
1271 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1272 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1273 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1274 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1275 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1276 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1277 q_fpnk_i = vnegq_s16 (q_fpnk_i);
1279 q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1280 q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1282 q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1283 q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1285 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1286 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1287 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1288 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1289 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1290 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1292 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1293 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1294 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1295 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1296 q_dst2_r = vrev32q_s16 (q_dst2_r);
1297 q_dst2_i = vrev32q_s16 (q_dst2_i);
1298 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1299 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1300 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1301 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1302 vst2q_s16 (p_dst, q2_dst);
1303 vst2q_s16 (p_dst2, q2_dst2);
1311 for (k = 1; k <= ncfft / 2 ; ++k)
1314 fpnk.r = src[ncfft - k].r;
1315 fpnk.i = - src[ncfft - k].i;
1322 f1k.r = fpk.r + fpnk.r;
1323 f1k.i = fpk.i + fpnk.i;
1325 f2k.r = fpk.r - fpnk.r;
1326 f2k.i = fpk.i - fpnk.i;
1333 dst[k].r = (f1k.r + tw.r) >> 1;
1334 dst[k].i = (f1k.i + tw.i) >> 1;
1335 dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1336 dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
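The scalar tail above is the reference form of the real-FFT recombination that the NEON loops perform eight bins at a time. A sketch of the relationships, writing Z for the half-length complex FFT of the packed real input, W for the super twiddles, and tw for the product W(k) * f2k (that multiply sits in lines not shown in this excerpt):

/* f1k = Z(k) + conj (Z(ncfft - k))   = 2 * FFT(even samples) at bin k
 * f2k = Z(k) - conj (Z(ncfft - k))   = 2j * FFT(odd samples) at bin k
 * tw  = W(k) * f2k
 * dst[k]         = (f1k + tw) / 2
 * dst[ncfft - k] = conj (f1k - tw) / 2
 * The trailing ">> 1" in the stores above is that final divide by two. */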
1351 int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1352 int16x8_t q_fnkc_r, q_fnkc_i;
1353 int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1354 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1355 int16x8_t q_dst2_r, q_dst2_i;
1356 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1359 dst[0].r = src[0].r + src[ncfft].r;
1360 dst[0].i = src[0].r - src[ncfft].r;
1368 for (k = 1; k <= count ; k += 8)
1370 p_src = (int16_t*) (& (src[k]));
1371 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1372 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1373 p_dst = (int16_t*) (& (dst[k]));
1374 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1376 q2_fk = vld2q_s16 (p_src);
1377 q2_fnkc = vld2q_s16 (p_src2);
1378 q2_tw = vld2q_s16 (p_twiddles);
1379 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1380 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1381 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1382 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1383 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1384 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1385 q_fnkc_i = vnegq_s16 (q_fnkc_i);
1387 q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
1388 q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
1389 q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
1390 q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
1392 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1393 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1394 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1395 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1396 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1397 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1399 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1400 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1401 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1402 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1403 q_dst2_r = vrev32q_s16 (q_dst2_r);
1404 q_dst2_i = vrev32q_s16 (q_dst2_i);
1405 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1406 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1407 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1408 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1409 vst2q_s16 (p_dst, q2_dst);
1410 vst2q_s16 (p_dst2, q2_dst2);
1417 for (k = 1; k <= count ; k += 8)
1419 p_src = (int16_t*) (& (src[k]));
1420 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1421 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1422 p_dst = (int16_t*) (& (dst[k]));
1423 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1425 q2_fk = vld2q_s16 (p_src);
1426 q2_fnkc = vld2q_s16 (p_src2);
1427 q2_tw = vld2q_s16 (p_twiddles);
1428 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1429 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1430 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1431 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1432 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1433 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1434 q_fnkc_i = vnegq_s16 (q_fnkc_i);
1436 q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
1437 q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
1438 q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
1439 q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
1441 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1442 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1443 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1444 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1445 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1446 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1448 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1449 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1450 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1451 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1452 q_dst2_r = vrev32q_s16 (q_dst2_r);
1453 q_dst2_i = vrev32q_s16 (q_dst2_i);
1454 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1455 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1456 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1457 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1458 vst2q_s16 (p_dst, q2_dst);
1459 vst2q_s16 (p_dst2, q2_dst2);
1467 for (k = 1; k <= ncfft / 2; k++)
1470 fnkc.r = src[ncfft - k].r;
1471 fnkc.i = -src[ncfft - k].i;
1478 fek.r = fk.r + fnkc.r;
1479 fek.i = fk.i + fnkc.i;
1481 tmp.r = fk.r - fnkc.r;
1482 tmp.i = fk.i - fnkc.i;
1489 dst[k].r = fek.r + fok.r;
1490 dst[k].i = fek.i + fok.i;
1492 dst[ncfft - k].r = fek.r - fok.r;
1493 dst[ncfft - k].i = fok.i - fek.i;
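This c2r tail mirrors the r2c split in reverse: it rebuilds the half-length complex spectrum that, after one inverse complex FFT, yields the interleaved even/odd real samples. A sketch of the relationships, with fok denoting (fk - fnkc) * conj(W(k)) (that twiddle multiply sits in lines not shown in this excerpt, and is visible in the NEON loops above as q_fok_r/q_fok_i):

/* fek = fk + fnkc = fk + conj (src[ncfft - k])     (even-sample contribution)
 * fok = (fk - fnkc) * conj (W(k))                  (odd-sample contribution)
 * dst[k]         = fek + fok
 * dst[ncfft - k] = conj (fek - fok)
 * so the subsequent half-length inverse c2c FFT returns the even samples in
 * the .r lane and the odd samples in the .i lane. */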
1518 ne10_fft2_backward_int16_scaled (fout, fin);
1521 ne10_fft4_backward_int16_scaled (fout, fin);
1524 ne10_fft8_backward_int16_scaled (fout, fin);
1539 ne10_fft2_forward_int16_scaled (fout, fin);
1542 ne10_fft4_forward_int16_scaled (fout, fin);
1545 ne10_fft8_forward_int16_scaled (fout, fin);
1563 ne10_fft2_backward_int16_unscaled (fout, fin);
1566 ne10_fft4_backward_int16_unscaled (fout, fin);
1569 ne10_fft8_backward_int16_unscaled (fout, fin);
1584 ne10_fft2_forward_int16_unscaled (fout, fin);
1587 ne10_fft4_forward_int16_unscaled (fout, fin);
1590 ne10_fft8_forward_int16_unscaled (fout, fin);
1616 c2c_state.buffer = tmpbuf2;
1619 ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1638 c2c_state.buffer = tmpbuf2;
1640 ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
void ne10_fft_c2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_cfg_int16_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
    Specific implementation of ne10_fft_c2c_1d_int16 using NEON SIMD capabilities.
void ne10_fft_r2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
    Specific implementation of ne10_fft_r2c_1d_int16 using NEON SIMD capabilities.
void ne10_fft_c2r_1d_int16_neon(ne10_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
    Specific implementation of ne10_fft_c2r_1d_int16 using NEON SIMD capabilities.
void ne10_mixed_radix_fft_forward_int16_unscaled_neon(ne10_fft_cpx_int16_t *Fout, ne10_fft_cpx_int16_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int16_t *twiddles, ne10_fft_cpx_int16_t *buffer) asm("ne10_mixed_radix_fft_forward_int16_unscaled_neon")
void ne10_mixed_radix_fft_forward_int16_scaled_neon(ne10_fft_cpx_int16_t *Fout, ne10_fft_cpx_int16_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int16_t *twiddles, ne10_fft_cpx_int16_t *buffer) asm("ne10_mixed_radix_fft_forward_int16_scaled_neon")
void ne10_mixed_radix_fft_backward_int16_unscaled_neon(ne10_fft_cpx_int16_t *Fout, ne10_fft_cpx_int16_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int16_t *twiddles, ne10_fft_cpx_int16_t *buffer) asm("ne10_mixed_radix_fft_backward_int16_unscaled_neon")
void ne10_mixed_radix_fft_backward_int16_scaled_neon(ne10_fft_cpx_int16_t *Fout, ne10_fft_cpx_int16_t *fin, ne10_int32_t *factors, ne10_fft_cpx_int16_t *twiddles, ne10_fft_cpx_int16_t *buffer) asm("ne10_mixed_radix_fft_backward_int16_scaled_neon")
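For reference, a minimal calling sketch for the c2c entry point described above. It assumes the standard Ne10 setup/teardown helpers ne10_fft_alloc_c2c_int16 and ne10_fft_destroy_c2c_int16 (substitute whatever your Ne10 version provides) and a transform length supported by this implementation; passing scaled_flag = 1 is the usual choice for int16 data so that intermediate stages cannot overflow.

#include "NE10.h"

void example_int16_fft (void)
{
    const ne10_int32_t nfft = 1024;            /* power-of-two length */
    ne10_fft_cpx_int16_t src[1024];
    ne10_fft_cpx_int16_t dst[1024];
    ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (nfft);

    /* ... fill src[] with Q15 samples ... */

    ne10_fft_c2c_1d_int16_neon (dst, src, cfg, 0 /* forward */, 1 /* scaled */);
    ne10_fft_c2c_1d_int16_neon (src, dst, cfg, 1 /* inverse */, 1 /* scaled */);

    ne10_fft_destroy_c2c_int16 (cfg);
}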