#ifndef NE10_FFT_NEONINTRINSIC_H
#define NE10_FFT_NEONINTRINSIC_H

/*
 * Statement-like helper macros built on NEON float32x4_t intrinsics.
 *
 * Conventions used by every macro in this header:
 *  - An argument such as Q_IN, Q_OUT, Q2_IN or Q2_TW is a name PREFIX, not a
 *    value: the macro pastes a digit onto it (Q_IN ## 0, Q_IN ## 1, ...) to
 *    reach a family of separately declared variables.  NE10_DECLARE_n
 *    creates such a family.
 *  - "Q" families are used as plain float32x4_t values.  "Q2" families are
 *    accessed through .val[0] and .val[1]; the NE10_CPX_* macros treat
 *    .val[0] as the real part and .val[1] as the imaginary part.
 *  - The expansions call arm_neon.h intrinsics, cast to ne10_float32_t and
 *    (verbose printers only) call fprintf on stderr.  None of these are
 *    declared here; the including file has to bring them into scope.
 *  - Q_TW_81, Q_TW_81N, DIV_TW81_NEON, DIV_TW81N_NEON, EIGH_NEON and
 *    QUAD_NEON are referenced by the kernels but are not defined in this
 *    header; they must be visible wherever the macros are expanded.
 *  - Every multi-statement macro expands to "do { ... } while (0);" -- the
 *    semicolon is part of the expansion, in the same way the no-op printers
 *    at the end of the file expand to ";".  Call sites therefore work with
 *    or without their own trailing semicolon.
 *
 * NOTE(review): the text this header was rebuilt from had lost its line
 * structure and some lines.  The following pieces were restored and should
 * be confirmed against the project history: the do/while terminators, the
 * last declaration of NE10_DECLARE_2/3/4 and the last four of
 * NE10_DECLARE_8, the pointer advance in the *_LOAD macros, the Q_TMP
 * declaration in the *_KERNEL_LAST macros, the "Q_IN ## 1 = Q_TMP" step of
 * NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST, the "#ifdef NE10_VERBOSE" line and
 * the closing "#endif" of the include guard.
 */

/* Z = A + B, component-wise on the real and imaginary vectors. */
#define NE10_CPX_ADD_NEON_F32(Z,A,B) do { \
    Z.val[0] = A.val[0] + B.val[0]; \
    Z.val[1] = A.val[1] + B.val[1]; \
} while (0);

/* Z = A - B, component-wise on the real and imaginary vectors. */
#define NE10_CPX_SUB_NEON_F32(Z,A,B) do { \
    Z.val[0] = A.val[0] - B.val[0]; \
    Z.val[1] = A.val[1] - B.val[1]; \
} while (0);

/*
 * Z = A * B (complex product):
 *   real = Ar*Br - Ai*Bi,  imag = Ar*Bi + Ai*Br.
 */
#define NE10_CPX_MUL_NEON_F32(Z,A,B) do { \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
    Z.val[0] = vmlsq_f32(ARBR, A.val[1], B.val[1]); \
    Z.val[1] = vmlaq_f32(ARBI, A.val[1], B.val[0]); \
} while (0);

/*
 * Z = A * conj(B):
 *   real = Ar*Br + Ai*Bi,  imag = Ai*Br - Ar*Bi.
 */
#define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do { \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
    float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
    float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \
    Z.val[0] = vaddq_f32(ARBR,AIBI); \
    Z.val[1] = vsubq_f32(AIBR,ARBI); \
} while (0);

/*
 * O1 = I1 + I2 and O2 = I1 - I2.
 * Uses the _F32 add/sub macros defined above; the previous text named
 * NE10_CPX_ADD_NEON / NE10_CPX_SUB_NEON, which this header does not define.
 */
#define NE10_BUTTERFLY_NEON_F32(O1,O2,I1,I2) do { \
    NE10_CPX_ADD_NEON_F32(O1,I1,I2); \
    NE10_CPX_SUB_NEON_F32(O2,I1,I2); \
} while (0);

/*
 * NE10_DECLARE_n(TYPE,NAME) declares n variables of TYPE named NAME0 ...
 * NAME<n-1>.  Each larger variant extends the next smaller one.
 */
#define NE10_DECLARE_2(TYPE,NAME) TYPE NAME ## 0; \
    TYPE NAME ## 1;

#define NE10_DECLARE_3(TYPE,NAME) NE10_DECLARE_2(TYPE,NAME); \
    TYPE NAME ## 2;

#define NE10_DECLARE_4(TYPE,NAME) NE10_DECLARE_3(TYPE,NAME); \
    TYPE NAME ## 3;

#define NE10_DECLARE_8(TYPE,NAME) NE10_DECLARE_4(TYPE,NAME); \
    TYPE NAME ## 4; \
    TYPE NAME ## 5; \
    TYPE NAME ## 6; \
    TYPE NAME ## 7;

/*
 * Reverse the four lanes of VECTOR4F in place: vrev64q_f32 swaps the two
 * lanes inside each 64-bit half, then the halves themselves are swapped.
 */
#define NE10_REVERSE_FLOAT32X4(VECTOR4F) do { \
    VECTOR4F = vrev64q_f32(VECTOR4F); \
    VECTOR4F = vcombine_f32( vget_high_f32( VECTOR4F ), vget_low_f32( VECTOR4F ) ); \
} while (0);

/* Same lane reversal as above, but written to VECTOR4F_OUT; the input is untouched. */
#define NE10_REVERSE_OUT_FLOAT32X4(VECTOR4F_OUT,VECTOR4F) do { \
    float32x4_t Q_TMP = vrev64q_f32(VECTOR4F); \
    VECTOR4F_OUT = vcombine_f32( vget_high_f32( Q_TMP ), vget_low_f32( Q_TMP ) ); \
} while (0);

/*
 * 4x4 transpose applied independently to the .val[0] vectors and to the
 * .val[1] vectors of Q2_IN0..3, written to Q2_OUT0..3.  Built from vtrnq_f32
 * on row pairs (0,1) and (2,3) followed by low/high half recombination.
 */
#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT,Q2_IN) do { \
    NE10_DECLARE_4(float32x4x2_t,q2_tmp); \
    q2_tmp0 = vtrnq_f32 (Q2_IN ## 0 .val[0], Q2_IN ## 1 .val[0]); \
    q2_tmp1 = vtrnq_f32 (Q2_IN ## 0 .val[1], Q2_IN ## 1 .val[1]); \
    q2_tmp2 = vtrnq_f32 (Q2_IN ## 2 .val[0], Q2_IN ## 3 .val[0]); \
    q2_tmp3 = vtrnq_f32 (Q2_IN ## 2 .val[1], Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 0 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0])); \
    Q2_OUT ## 0 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0])); \
    Q2_OUT ## 1 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1])); \
    Q2_OUT ## 1 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1])); \
    Q2_OUT ## 2 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0])); \
    Q2_OUT ## 2 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0])); \
    Q2_OUT ## 3 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1])); \
    Q2_OUT ## 3 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1])); \
} while (0);

/* Brace initializer that repeats VAR in all four lanes. */
#define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }

/* +/- 0.70710678 (double literals). */
#define CONST_TW_81 0.70710678
#define CONST_TW_81N -0.70710678

/* +/- 1.4142136 (float literals). */
#define DIV_TW81 1.4142136f
#define DIV_TW81N -1.4142136f

/*
 * Radix-8 real-to-complex kernel, stage 1: sum/difference of the pairs
 * (0,4), (1,5), (2,6), (3,7), then Q_OUT3 *= Q_TW_81 and Q_OUT7 *= Q_TW_81N.
 */
#define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
    Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_OUT ## 2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 5); \
    Q_OUT ## 3 = vsubq_f32 (Q_IN ## 1, Q_IN ## 5); \
    Q_OUT ## 4 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_OUT ## 5 = vsubq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_OUT ## 6 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 7 = vsubq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 3 = vmulq_f32 (Q_OUT ## 3, Q_TW_81 ); \
    Q_OUT ## 7 = vmulq_f32 (Q_OUT ## 7, Q_TW_81N); \
} while (0);

/*
 * Radix-8 real-to-complex kernel, stage 2: combines the stage-1 results into
 * Q_OUT0..7 using four local partial sums (Q_S0..Q_S3).
 */
#define NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S); \
    Q_S0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_S1 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_S2 = vsubq_f32 (Q_IN ## 7, Q_IN ## 3); \
    Q_S3 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 0 = vaddq_f32 ( Q_S0, Q_S1 ); \
    Q_OUT ## 1 = vaddq_f32 ( Q_IN ## 1, Q_S3 ); \
    Q_OUT ## 2 = vsubq_f32 ( Q_S2, Q_IN ## 5 ); \
    Q_OUT ## 3 = vsubq_f32 ( Q_IN ## 0, Q_IN ## 4 ); \
    Q_OUT ## 4 = vsubq_f32 ( Q_IN ## 6, Q_IN ## 2 ); \
    Q_OUT ## 5 = vsubq_f32 ( Q_IN ## 1, Q_S3 ); \
    Q_OUT ## 6 = vaddq_f32 ( Q_IN ## 5, Q_S2 ); \
    Q_OUT ## 7 = vsubq_f32 ( Q_S0, Q_S1 ); \
} while (0);

/*
 * Radix-8 complex-to-real kernel, stage 1.  Forms eight local partial
 * results (Q_S_IN0..7; inputs 3 and 4 are doubled) and combines them into
 * Q_OUT0..7.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32(Q_IN ## 0, Q_IN ## 7); \
    Q_S_IN1 = vsubq_f32(Q_IN ## 0, Q_IN ## 7); \
    Q_S_IN2 = vaddq_f32(Q_IN ## 1, Q_IN ## 5); \
    Q_S_IN3 = vsubq_f32(Q_IN ## 1, Q_IN ## 5); \
    Q_S_IN4 = vaddq_f32(Q_IN ## 6, Q_IN ## 2); \
    Q_S_IN5 = vsubq_f32(Q_IN ## 6, Q_IN ## 2); \
    Q_S_IN6 = vaddq_f32(Q_IN ## 3, Q_IN ## 3); \
    Q_S_IN7 = vaddq_f32(Q_IN ## 4, Q_IN ## 4); \
    Q_OUT ## 0 = vaddq_f32(Q_S_IN0, Q_S_IN6); \
    Q_OUT ## 1 = vaddq_f32(Q_S_IN2, Q_S_IN2); \
    Q_OUT ## 2 = vsubq_f32(Q_S_IN1, Q_S_IN7); \
    Q_OUT ## 3 = vsubq_f32(Q_S_IN3, Q_S_IN4); \
    Q_OUT ## 4 = vsubq_f32(Q_S_IN0, Q_S_IN6); \
    Q_OUT ## 5 = vaddq_f32(Q_S_IN5, Q_S_IN5); \
    Q_OUT ## 6 = vaddq_f32(Q_S_IN1, Q_S_IN7); \
    Q_OUT ## 7 = vaddq_f32(Q_S_IN4, Q_S_IN3); \
} while (0);

/*
 * Radix-8 complex-to-real kernel, stage 2.  Overwrites Q_IN3 and Q_IN7
 * (scaled by DIV_TW81_NEON / DIV_TW81N_NEON) before taking sums and
 * differences of adjacent input pairs.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3,DIV_TW81_NEON); \
    Q_IN ## 7 = vmulq_f32(Q_IN ## 7,DIV_TW81N_NEON); \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 4 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 5 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 4, Q_IN ## 5); \
    Q_OUT ## 6 = vsubq_f32(Q_IN ## 4, Q_IN ## 5); \
    Q_OUT ## 3 = vaddq_f32(Q_IN ## 6, Q_IN ## 7); \
    Q_OUT ## 7 = vsubq_f32(Q_IN ## 6, Q_IN ## 7); \
} while (0);

/* Multiply Q_OUT0..7 in place by EIGH_NEON. */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, EIGH_NEON); \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, EIGH_NEON); \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, EIGH_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, EIGH_NEON); \
    Q_OUT ## 4 = vmulq_f32( Q_OUT ## 4, EIGH_NEON); \
    Q_OUT ## 5 = vmulq_f32( Q_OUT ## 5, EIGH_NEON); \
    Q_OUT ## 6 = vmulq_f32( Q_OUT ## 6, EIGH_NEON); \
    Q_OUT ## 7 = vmulq_f32( Q_OUT ## 7, EIGH_NEON); \
} while (0);

/* Multiply Q_OUT0..3 in place by QUAD_NEON. */
#define NE10_RADIX4x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, QUAD_NEON); \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, QUAD_NEON); \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, QUAD_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, QUAD_NEON); \
} while (0);

/* Multiply both .val[0] and .val[1] of Q2_OUT0..3 in place by QUAD_NEON. */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_SCALE(Q2_OUT) do { \
    Q2_OUT ## 0 .val[0] = vmulq_f32( Q2_OUT ## 0 .val[0], QUAD_NEON); \
    Q2_OUT ## 1 .val[0] = vmulq_f32( Q2_OUT ## 1 .val[0], QUAD_NEON); \
    Q2_OUT ## 2 .val[0] = vmulq_f32( Q2_OUT ## 2 .val[0], QUAD_NEON); \
    Q2_OUT ## 3 .val[0] = vmulq_f32( Q2_OUT ## 3 .val[0], QUAD_NEON); \
    Q2_OUT ## 0 .val[1] = vmulq_f32( Q2_OUT ## 0 .val[1], QUAD_NEON); \
    Q2_OUT ## 1 .val[1] = vmulq_f32( Q2_OUT ## 1 .val[1], QUAD_NEON); \
    Q2_OUT ## 2 .val[1] = vmulq_f32( Q2_OUT ## 2 .val[1], QUAD_NEON); \
    Q2_OUT ## 3 .val[1] = vmulq_f32( Q2_OUT ## 3 .val[1], QUAD_NEON); \
} while (0);

/* Full radix-8 real-to-complex kernel: stage 1 into local temporaries, then stage 2. */
#define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN); \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_S_IN,Q_IN); \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_S_IN); \
} while (0);

/*
 * Radix-4 real-to-complex kernel:
 *   OUT0 = (IN0+IN2) + (IN1+IN3),  OUT1 = IN0 - IN2,
 *   OUT2 = IN3 - IN1,              OUT3 = (IN0+IN2) - (IN1+IN3).
 */
#define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 2); \
    Q_S_IN1 = vaddq_f32 (Q_IN ## 1, Q_IN ## 3); \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN1); \
    Q_OUT ## 1 = vsubq_f32 (Q_IN##0, Q_IN##2); \
    Q_OUT ## 2 = vsubq_f32 (Q_IN##3, Q_IN##1); \
    Q_OUT ## 3 = vsubq_f32 (Q_S_IN0, Q_S_IN1); \
} while (0);

/*
 * Radix-4 complex-to-real kernel:
 *   OUT0 = (IN0+IN3) + 2*IN1,  OUT1 = (IN0-IN3) - 2*IN2,
 *   OUT2 = (IN0+IN3) - 2*IN1,  OUT3 = (IN0-IN3) + 2*IN2.
 */
#define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32 (Q_IN##0, Q_IN##3); \
    Q_S_IN1 = vsubq_f32 (Q_IN##0, Q_IN##3); \
    Q_S_IN2 = vaddq_f32 (Q_IN##1, Q_IN##1); \
    Q_S_IN3 = vaddq_f32 (Q_IN##2, Q_IN##2); \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN2); \
    Q_OUT ## 1 = vsubq_f32 (Q_S_IN1, Q_S_IN3); \
    Q_OUT ## 2 = vsubq_f32 (Q_S_IN0, Q_S_IN2); \
    Q_OUT ## 3 = vaddq_f32 (Q_S_IN1, Q_S_IN3); \
} while (0);

/*
 * Full radix-8 complex-to-real kernel.  The temporaries use the distinct
 * prefix Q_S_IN_C2R_KERNEL because stage 1 declares its own Q_S_IN family.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN_C2R_KERNEL); \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_S_IN_C2R_KERNEL,Q_IN); \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_S_IN_C2R_KERNEL); \
} while (0);

/*
 * Load Q_IN0..7 from PTR_IN, advancing PTR_IN by IN_STEP after every load.
 * PTR_IN must be a modifiable lvalue; it ends 8 * IN_STEP past its start.
 * NOTE(review): the advance was missing from the recovered text (which left
 * IN_STEP unused and loaded all eight registers from one address); confirm
 * that callers expect PTR_IN to be moved.
 */
#define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 4 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 5 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 6 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 7 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
} while (0);

/*
 * Four-register variant of the load above: fills Q_IN0..3 and leaves PTR_IN
 * 4 * IN_STEP past its start.  Same NOTE(review) applies.
 */
#define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += (IN_STEP); \
} while (0);

/*
 * Store Q_OUT0..7 at PTR_OUT + k * OUT_STEP for k = 0..7.  PTR_OUT is not
 * modified.  Both arguments are parenthesized so that expression arguments
 * (e.g. "base + 1", "a + b") bind as the caller intends.
 */
#define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 0 * (OUT_STEP) ), Q_OUT ## 0); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 1 * (OUT_STEP) ), Q_OUT ## 1); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 2 * (OUT_STEP) ), Q_OUT ## 2); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 3 * (OUT_STEP) ), Q_OUT ## 3); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 4 * (OUT_STEP) ), Q_OUT ## 4); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 5 * (OUT_STEP) ), Q_OUT ## 5); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 6 * (OUT_STEP) ), Q_OUT ## 6); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 7 * (OUT_STEP) ), Q_OUT ## 7); \
} while (0);

/* Four-register variant of the store above (k = 0..3). */
#define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 0 * (OUT_STEP) ), Q_OUT ## 0); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 1 * (OUT_STEP) ), Q_OUT ## 1); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 2 * (OUT_STEP) ), Q_OUT ## 2); \
    vst1q_f32( (ne10_float32_t*) ( (PTR_OUT) + 3 * (OUT_STEP) ), Q_OUT ## 3); \
} while (0);

/*
 * Twiddle multiply for the forward direction: OUT0 = IN0 unchanged, and
 * OUTk = INk * TW(k-1) for k = 1..3.
 */
#define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
    Q2_OUT ## 0 = Q2_IN ## 0; \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0); \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1); \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2); \
} while (0);

/*
 * Twiddle multiply for the inverse direction: OUT0 = IN0 unchanged, and
 * OUTk = INk * conj(TW(k-1)) for k = 1..3.
 */
#define NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
    Q2_OUT ## 0 = Q2_IN ## 0; \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0); \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1); \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2); \
} while (0);

/* Forward twiddled kernel, stage 1: complex sum/difference of (0,2) and (1,3). */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 2); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 1,Q2_IN ## 0,Q2_IN ## 2); \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 2,Q2_IN ## 1,Q2_IN ## 3); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 1,Q2_IN ## 3); \
} while (0);

/*
 * Forward twiddled kernel, stage 2.  Elements 0 and 2 combine like parts;
 * elements 1 and 3 combine real with imaginary parts.  OUT3.val[1] is
 * negated as the final step.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vaddq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vsubq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 1 .val[1] = vsubq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[1] = vaddq_f32(Q2_IN ## 3 .val[0] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vnegq_f32(Q2_OUT ## 3 .val[1]); \
} while (0);

/*
 * Forward "last" kernel on plain vectors.  Overwrites Q_IN1 and Q_IN3:
 * both are scaled by Q_TW_81, then replaced by their difference and sum.
 * Outputs: OUT0 = IN0 + IN1, OUT1 = -(IN2 + IN3), OUT2 = IN0 - IN1,
 * OUT3 = IN2 - IN3 (using the updated IN1 / IN3).
 * NOTE(review): the Q_TMP declaration and the "Q_IN ## 1 = Q_TMP" step were
 * missing from the recovered text.  With that step this macro is undone by
 * NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST up to a factor of 4 when
 * Q_TW_81 * DIV_TW81_NEON == 1; without it Q_TMP would be computed and
 * never read.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
    float32x4_t Q_TMP; \
    Q_IN ## 1 = vmulq_f32(Q_IN ## 1, Q_TW_81); \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3, Q_TW_81); \
    Q_TMP = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_IN ## 3 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_IN ## 1 = Q_TMP; \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 2 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 1 = vnegq_f32(Q_OUT ## 1); \
} while (0);

/*
 * Inverse "last" kernel on plain vectors.  Negates Q_IN1 in place, takes
 * sums/differences of (0,2) and (1,3), recombines OUT1/OUT3 through Q_TMP
 * and scales them by DIV_TW81_NEON, and doubles OUT0 and OUT2.
 * NOTE(review): the Q_TMP declaration was missing from the recovered text.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
    float32x4_t Q_TMP; \
    Q_IN ## 1 = vnegq_f32(Q_IN ## 1 ); \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 2); \
    Q_OUT ## 1 = vsubq_f32(Q_IN ## 0, Q_IN ## 2); \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_TMP = vaddq_f32(Q_OUT ## 1, Q_OUT ## 3); \
    Q_OUT ## 3 = vsubq_f32(Q_OUT ## 3, Q_OUT ## 1); \
    Q_OUT ## 1 = Q_TMP; \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, DIV_TW81_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, DIV_TW81_NEON); \
    Q_OUT ## 0 = vaddq_f32( Q_OUT ## 0, Q_OUT ## 0 ); \
    Q_OUT ## 2 = vaddq_f32( Q_OUT ## 2, Q_OUT ## 2 ); \
} while (0);

/*
 * Inverse twiddled kernel, stage 1.  Negates Q2_IN3.val[1] in place first,
 * then forms the sums/differences of (0,2) and (1,3); OUT3 takes its real
 * part from the imaginary difference and vice versa.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
    Q2_IN ## 3 .val[1] = vnegq_f32(Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vsubq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vaddq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 1 .val[1] = vaddq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 3 .val[1] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
} while (0);

/* Inverse twiddled kernel, stage 2: complex sum/difference of (0,1) and (2,3). */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 1); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 2,Q2_IN ## 0,Q2_IN ## 1); \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 1,Q2_IN ## 2,Q2_IN ## 3); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 2,Q2_IN ## 3); \
} while (0);

/*
 * Full forward twiddled kernel: twiddle multiply, stage 1, stage 2.
 * Q2_IN is reused as scratch between the stages, so its contents are
 * overwritten.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
    NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_IN,Q2_OUT); \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN); \
} while (0);

/*
 * Full inverse twiddled kernel: stage 1, stage 2, conjugate twiddle
 * multiply.  Q2_IN is reused as scratch and is overwritten.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN); \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_IN,Q2_OUT); \
    NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
} while (0);

/*
 * Debug printers.  With NE10_VERBOSE defined they write the enclosing
 * function name, the stringized prefix and each register's four lanes to
 * stderr (lanes are read with [] on the vector, a compiler vector
 * extension).  Without NE10_VERBOSE they expand to an empty statement.
 */
#ifdef NE10_VERBOSE

/* Print Q_VECTOR0..7. */
#define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
    fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
    fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
    fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
    fprintf(stderr,"4:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 4[0], Q_VECTOR ## 4[1], Q_VECTOR ## 4[2], Q_VECTOR ## 4[3] ); \
    fprintf(stderr,"5:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 5[0], Q_VECTOR ## 5[1], Q_VECTOR ## 5[2], Q_VECTOR ## 5[3] ); \
    fprintf(stderr,"6:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 6[0], Q_VECTOR ## 6[1], Q_VECTOR ## 6[2], Q_VECTOR ## 6[3] ); \
    fprintf(stderr,"7:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 7[0], Q_VECTOR ## 7[1], Q_VECTOR ## 7[2], Q_VECTOR ## 7[3] ); \
} while (0);

/* Print Q_VECTOR0..3. */
#define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
    fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
    fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
    fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
} while (0);

/* Print the .val[0] ("R") rows of Q_VECTOR0..3, then the .val[1] ("I") rows. */
#define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[0][0], Q_VECTOR ## 0 .val[0][1], Q_VECTOR ## 0 .val[0][2], Q_VECTOR ## 0 .val[0][3] ); \
    fprintf(stderr,"1R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[0][0], Q_VECTOR ## 1 .val[0][1], Q_VECTOR ## 1 .val[0][2], Q_VECTOR ## 1 .val[0][3] ); \
    fprintf(stderr,"2R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[0][0], Q_VECTOR ## 2 .val[0][1], Q_VECTOR ## 2 .val[0][2], Q_VECTOR ## 2 .val[0][3] ); \
    fprintf(stderr,"3R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[0][0], Q_VECTOR ## 3 .val[0][1], Q_VECTOR ## 3 .val[0][2], Q_VECTOR ## 3 .val[0][3] ); \
    fprintf(stderr,"0I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[1][0], Q_VECTOR ## 0 .val[1][1], Q_VECTOR ## 0 .val[1][2], Q_VECTOR ## 0 .val[1][3] ); \
    fprintf(stderr,"1I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[1][0], Q_VECTOR ## 1 .val[1][1], Q_VECTOR ## 1 .val[1][2], Q_VECTOR ## 1 .val[1][3] ); \
    fprintf(stderr,"2I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[1][0], Q_VECTOR ## 2 .val[1][1], Q_VECTOR ## 2 .val[1][2], Q_VECTOR ## 2 .val[1][3] ); \
    fprintf(stderr,"3I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[1][0], Q_VECTOR ## 3 .val[1][1], Q_VECTOR ## 3 .val[1][2], Q_VECTOR ## 3 .val[1][3] ); \
} while (0);

#else // NE10_VERBOSE not defined
#define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) ;
#define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) ;
#define NE10_PRINT_Q2x4_VECTOR(Q2_VECTOR) ;
#endif // NE10_VERBOSE

#endif // NE10_FFT_NEONINTRINSIC_H