/*
 * Project Ne10 — an open, optimized software library for the ARM architecture.
 * File: NE10_fft.neonintrinsic.h
 */
/*
 *  Copyright 2014-16 ARM Limited and Contributors.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of ARM Limited nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft.neonintrinsic.h
 */

#ifndef NE10_FFT_NEONINTRINSIC_H
#define NE10_FFT_NEONINTRINSIC_H

#include "NE10_fft.h"
#include <arm_neon.h>

38 #define NE10_CPX_ADD_NEON_F32(Z,A,B) do { \
39  Z.val[0] = A.val[0] + B.val[0]; \
40  Z.val[1] = A.val[1] + B.val[1]; \
41 } while (0);
42 
43 #define NE10_CPX_SUB_NEON_F32(Z,A,B) do { \
44  Z.val[0] = A.val[0] - B.val[0]; \
45  Z.val[1] = A.val[1] - B.val[1]; \
46 } while (0);
47 
48 #define NE10_CPX_MUL_NEON_F32(Z,A,B) do { \
49  float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
50  float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
51  Z.val[0] = vmlsq_f32(ARBR, A.val[1], B.val[1]); \
52  Z.val[1] = vmlaq_f32(ARBI, A.val[1], B.val[0]); \
53 } while (0);
54 
55 #define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do { \
56  float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
57  float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \
58  float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
59  float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \
60  Z.val[0] = vaddq_f32(ARBR,AIBI); \
61  Z.val[1] = vsubq_f32(AIBR,ARBI); \
62 } while (0);
63 
64 #define NE10_BUTTERFLY_NEON_F32(O1,O2,I1,I2) do { \
65  NE10_CPX_ADD_NEON(O1,I1,I2); \
66  NE10_CPX_SUB_NEON(O2,I1,I2); \
67 } while(0);
68 
69 #define NE10_DECLARE_2(TYPE,NAME) TYPE NAME ## 0; \
70  TYPE NAME ## 1;
71 
72 #define NE10_DECLARE_3(TYPE,NAME) NE10_DECLARE_2(TYPE,NAME); \
73  TYPE NAME ## 2;
74 
75 #define NE10_DECLARE_4(TYPE,NAME) NE10_DECLARE_3(TYPE,NAME); \
76  TYPE NAME ## 3;
77 
78 #define NE10_DECLARE_8(TYPE,NAME) NE10_DECLARE_4(TYPE,NAME); \
79  TYPE NAME ## 4; \
80  TYPE NAME ## 5; \
81  TYPE NAME ## 6; \
82  TYPE NAME ## 7;
83 
84 #define NE10_REVERSE_FLOAT32X4(VECTOR4F) do { \
85  VECTOR4F = vrev64q_f32(VECTOR4F); \
86  VECTOR4F = vcombine_f32( vget_high_f32( VECTOR4F ), vget_low_f32( VECTOR4F ) ); \
87 } while (0);
88 
89 #define NE10_REVERSE_OUT_FLOAT32X4(VECTOR4F_OUT,VECTOR4F) do { \
90  float32x4_t Q_TMP = vrev64q_f32(VECTOR4F); \
91  VECTOR4F_OUT = vcombine_f32( vget_high_f32( Q_TMP ), vget_low_f32( Q_TMP ) ); \
92 } while (0);
93 
94 #define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT,Q2_IN) do { \
95  NE10_DECLARE_4(float32x4x2_t,q2_tmp); \
96  q2_tmp0 = vtrnq_f32 (Q2_IN ## 0 .val[0], Q2_IN ## 1 .val[0]); \
97  q2_tmp1 = vtrnq_f32 (Q2_IN ## 0 .val[1], Q2_IN ## 1 .val[1]); \
98  q2_tmp2 = vtrnq_f32 (Q2_IN ## 2 .val[0], Q2_IN ## 3 .val[0]); \
99  q2_tmp3 = vtrnq_f32 (Q2_IN ## 2 .val[1], Q2_IN ## 3 .val[1]); \
100  Q2_OUT ## 0 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0])); \
101  Q2_OUT ## 0 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0])); \
102  Q2_OUT ## 1 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1])); \
103  Q2_OUT ## 1 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1])); \
104  Q2_OUT ## 2 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0])); \
105  Q2_OUT ## 2 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0])); \
106  Q2_OUT ## 3 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1])); \
107  Q2_OUT ## 3 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1])); \
108 } while(0);
109 
110 #define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }
111 
112 #define CONST_TW_81 0.70710678
113 #define CONST_TW_81N -0.70710678
114 
115 const static float32x4_t Q_TW_81 = VDUPQ_N_F32(CONST_TW_81 );
116 const static float32x4_t Q_TW_81N = VDUPQ_N_F32(CONST_TW_81N);
117 
118 #define DIV_TW81 1.4142136f
119 #define DIV_TW81N -1.4142136f
120 
121 const static float32x4_t DIV_TW81_NEON = VDUPQ_N_F32(DIV_TW81);
122 const static float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);
123 
124 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
125  Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
126  Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 4); \
127  Q_OUT ## 2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 5); \
128  Q_OUT ## 3 = vsubq_f32 (Q_IN ## 1, Q_IN ## 5); \
129  Q_OUT ## 4 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
130  Q_OUT ## 5 = vsubq_f32 (Q_IN ## 2, Q_IN ## 6); \
131  Q_OUT ## 6 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
132  Q_OUT ## 7 = vsubq_f32 (Q_IN ## 3, Q_IN ## 7); \
133  Q_OUT ## 3 = vmulq_f32 (Q_OUT ## 3, Q_TW_81 ); \
134  Q_OUT ## 7 = vmulq_f32 (Q_OUT ## 7, Q_TW_81N); \
135 } while(0);
136 
137 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
138  NE10_DECLARE_4(float32x4_t,Q_S); \
139  Q_S0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
140  Q_S1 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
141  Q_S2 = vsubq_f32 (Q_IN ## 7, Q_IN ## 3); \
142  Q_S3 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
143  Q_OUT ## 0 = vaddq_f32 ( Q_S0, Q_S1 ); \
144  Q_OUT ## 1 = vaddq_f32 ( Q_IN ## 1, Q_S3 ); \
145  Q_OUT ## 2 = vsubq_f32 ( Q_S2, Q_IN ## 5 ); \
146  Q_OUT ## 3 = vsubq_f32 ( Q_IN ## 0, Q_IN ## 4 ); \
147  Q_OUT ## 4 = vsubq_f32 ( Q_IN ## 6, Q_IN ## 2 ); \
148  Q_OUT ## 5 = vsubq_f32 ( Q_IN ## 1, Q_S3 ); \
149  Q_OUT ## 6 = vaddq_f32 ( Q_IN ## 5, Q_S2 ); \
150  Q_OUT ## 7 = vsubq_f32 ( Q_S0, Q_S1 ); \
151 } while(0);
152 
153 #define NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
154  NE10_DECLARE_8(float32x4_t,Q_S_IN); \
155  Q_S_IN0 = vaddq_f32(Q_IN ## 0, Q_IN ## 7); \
156  Q_S_IN1 = vsubq_f32(Q_IN ## 0, Q_IN ## 7); \
157  Q_S_IN2 = vaddq_f32(Q_IN ## 1, Q_IN ## 5); \
158  Q_S_IN3 = vsubq_f32(Q_IN ## 1, Q_IN ## 5); \
159  Q_S_IN4 = vaddq_f32(Q_IN ## 6, Q_IN ## 2); \
160  Q_S_IN5 = vsubq_f32(Q_IN ## 6, Q_IN ## 2); \
161  Q_S_IN6 = vaddq_f32(Q_IN ## 3, Q_IN ## 3); \
162  Q_S_IN7 = vaddq_f32(Q_IN ## 4, Q_IN ## 4); \
163  Q_OUT ## 0 = vaddq_f32(Q_S_IN0, Q_S_IN6); \
164  Q_OUT ## 1 = vaddq_f32(Q_S_IN2, Q_S_IN2); \
165  Q_OUT ## 2 = vsubq_f32(Q_S_IN1, Q_S_IN7); \
166  Q_OUT ## 3 = vsubq_f32(Q_S_IN3, Q_S_IN4); \
167  Q_OUT ## 4 = vsubq_f32(Q_S_IN0, Q_S_IN6); \
168  Q_OUT ## 5 = vaddq_f32(Q_S_IN5, Q_S_IN5); \
169  Q_OUT ## 6 = vaddq_f32(Q_S_IN1, Q_S_IN7); \
170  Q_OUT ## 7 = vaddq_f32(Q_S_IN4, Q_S_IN3); \
171 } while (0);
172 
173 #define NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
174  Q_IN ## 3 = vmulq_f32(Q_IN ## 3,DIV_TW81_NEON); \
175  Q_IN ## 7 = vmulq_f32(Q_IN ## 7,DIV_TW81N_NEON); \
176  Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
177  Q_OUT ## 4 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
178  Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
179  Q_OUT ## 5 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
180  Q_OUT ## 2 = vaddq_f32(Q_IN ## 4, Q_IN ## 5); \
181  Q_OUT ## 6 = vsubq_f32(Q_IN ## 4, Q_IN ## 5); \
182  Q_OUT ## 3 = vaddq_f32(Q_IN ## 6, Q_IN ## 7); \
183  Q_OUT ## 7 = vsubq_f32(Q_IN ## 6, Q_IN ## 7); \
184 } while(0);
185 
186 #define NE10_RADIX8x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
187  Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, EIGH_NEON); \
188  Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, EIGH_NEON); \
189  Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, EIGH_NEON); \
190  Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, EIGH_NEON); \
191  Q_OUT ## 4 = vmulq_f32( Q_OUT ## 4, EIGH_NEON); \
192  Q_OUT ## 5 = vmulq_f32( Q_OUT ## 5, EIGH_NEON); \
193  Q_OUT ## 6 = vmulq_f32( Q_OUT ## 6, EIGH_NEON); \
194  Q_OUT ## 7 = vmulq_f32( Q_OUT ## 7, EIGH_NEON); \
195 } while(0);
196 
197 #define NE10_RADIX4x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
198  Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, QUAD_NEON); \
199  Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, QUAD_NEON); \
200  Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, QUAD_NEON); \
201  Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, QUAD_NEON); \
202 } while(0);
203 
204 #define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_SCALE(Q2_OUT) do { \
205  Q2_OUT ## 0 .val[0] = vmulq_f32( Q2_OUT ## 0 .val[0], QUAD_NEON); \
206  Q2_OUT ## 1 .val[0] = vmulq_f32( Q2_OUT ## 1 .val[0], QUAD_NEON); \
207  Q2_OUT ## 2 .val[0] = vmulq_f32( Q2_OUT ## 2 .val[0], QUAD_NEON); \
208  Q2_OUT ## 3 .val[0] = vmulq_f32( Q2_OUT ## 3 .val[0], QUAD_NEON); \
209  Q2_OUT ## 0 .val[1] = vmulq_f32( Q2_OUT ## 0 .val[1], QUAD_NEON); \
210  Q2_OUT ## 1 .val[1] = vmulq_f32( Q2_OUT ## 1 .val[1], QUAD_NEON); \
211  Q2_OUT ## 2 .val[1] = vmulq_f32( Q2_OUT ## 2 .val[1], QUAD_NEON); \
212  Q2_OUT ## 3 .val[1] = vmulq_f32( Q2_OUT ## 3 .val[1], QUAD_NEON); \
213 } while(0);
214 
215 #define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
216  NE10_DECLARE_8(float32x4_t,Q_S_IN); \
217  NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_S_IN,Q_IN); \
218  NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_S_IN); \
219 } while(0);
220 
221 #define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
222  NE10_DECLARE_4(float32x4_t,Q_S_IN); \
223  Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 2); \
224  Q_S_IN1 = vaddq_f32 (Q_IN ## 1, Q_IN ## 3); \
225  Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN1); \
226  Q_OUT ## 1 = vsubq_f32 (Q_IN##0, Q_IN##2); \
227  Q_OUT ## 2 = vsubq_f32 (Q_IN##3, Q_IN##1); \
228  Q_OUT ## 3 = vsubq_f32 (Q_S_IN0, Q_S_IN1); \
229 } while(0);
230 
231 #define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
232  NE10_DECLARE_4(float32x4_t,Q_S_IN); \
233  Q_S_IN0 = vaddq_f32 (Q_IN##0, Q_IN##3); \
234  Q_S_IN1 = vsubq_f32 (Q_IN##0, Q_IN##3); \
235  Q_S_IN2 = vaddq_f32 (Q_IN##1, Q_IN##1); \
236  Q_S_IN3 = vaddq_f32 (Q_IN##2, Q_IN##2); \
237  Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN2); \
238  Q_OUT ## 1 = vsubq_f32 (Q_S_IN1, Q_S_IN3); \
239  Q_OUT ## 2 = vsubq_f32 (Q_S_IN0, Q_S_IN2); \
240  Q_OUT ## 3 = vaddq_f32 (Q_S_IN1, Q_S_IN3); \
241 } while(0);
242 
243 #define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
244  NE10_DECLARE_8(float32x4_t,Q_S_IN_C2R_KERNEL); \
245  NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_S_IN_C2R_KERNEL,Q_IN); \
246  NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_S_IN_C2R_KERNEL); \
247 } while(0);
248 
249 #define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
250  Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
251  PTR_IN += IN_STEP; \
252  Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
253  PTR_IN += IN_STEP; \
254  Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
255  PTR_IN += IN_STEP; \
256  Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
257  PTR_IN += IN_STEP; \
258  Q_IN ## 4 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
259  PTR_IN += IN_STEP; \
260  Q_IN ## 5 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
261  PTR_IN += IN_STEP; \
262  Q_IN ## 6 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
263  PTR_IN += IN_STEP; \
264  Q_IN ## 7 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
265  PTR_IN += IN_STEP; \
266 } while(0);
267 
268 #define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do {\
269  Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
270  PTR_IN += IN_STEP; \
271  Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
272  PTR_IN += IN_STEP; \
273  Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
274  PTR_IN += IN_STEP; \
275  Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
276  PTR_IN += IN_STEP; \
277 } while(0);
278 
279 #define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
280  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0); \
281  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1); \
282  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2); \
283  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3); \
284  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 4 * OUT_STEP ), Q_OUT ## 4); \
285  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 5 * OUT_STEP ), Q_OUT ## 5); \
286  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 6 * OUT_STEP ), Q_OUT ## 6); \
287  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 7 * OUT_STEP ), Q_OUT ## 7); \
288 } while(0);
289 
290 #define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
291  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0); \
292  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1); \
293  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2); \
294  vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3); \
295 } while(0);
296 
297 #define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
298  Q2_OUT ## 0 = Q2_IN ## 0; \
299  NE10_CPX_MUL_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0); \
300  NE10_CPX_MUL_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1); \
301  NE10_CPX_MUL_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2); \
302 } while(0);
303 
304 #define NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
305  Q2_OUT ## 0 = Q2_IN ## 0; \
306  NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0); \
307  NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1); \
308  NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2); \
309 } while(0);
310 
311 #define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
312  NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 2); \
313  NE10_CPX_SUB_NEON_F32(Q2_OUT ## 1,Q2_IN ## 0,Q2_IN ## 2); \
314  NE10_CPX_ADD_NEON_F32(Q2_OUT ## 2,Q2_IN ## 1,Q2_IN ## 3); \
315  NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 1,Q2_IN ## 3); \
316 } while(0);
317 
318 #define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
319  Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
320  Q2_OUT ## 0 .val[1] = vaddq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
321  Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
322  Q2_OUT ## 2 .val[1] = vsubq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
323  Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
324  Q2_OUT ## 1 .val[1] = vsubq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[0]); \
325  Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
326  Q2_OUT ## 3 .val[1] = vaddq_f32(Q2_IN ## 3 .val[0] , Q2_IN ## 1 .val[1]); \
327  Q2_OUT ## 3 .val[1] = vnegq_f32(Q2_OUT ## 3 .val[1]); \
328 } while(0);
329 
330 #define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
331  float32x4_t Q_TMP; \
332  Q_IN ## 1 = vmulq_f32(Q_IN ## 1, Q_TW_81); \
333  Q_IN ## 3 = vmulq_f32(Q_IN ## 3, Q_TW_81); \
334  Q_TMP = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
335  Q_IN ## 3 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
336  Q_IN ## 1 = Q_TMP; \
337  Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
338  Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
339  Q_OUT ## 2 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
340  Q_OUT ## 3 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
341  Q_OUT ## 1 = vnegq_f32(Q_OUT ## 1); \
342 } while(0);
343 
344 #define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
345  float32x4_t Q_TMP; \
346  Q_IN ## 1 = vnegq_f32(Q_IN ## 1 ); \
347  Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 2); \
348  Q_OUT ## 1 = vsubq_f32(Q_IN ## 0, Q_IN ## 2); \
349  Q_OUT ## 2 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
350  Q_OUT ## 3 = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
351  Q_TMP = vaddq_f32(Q_OUT ## 1, Q_OUT ## 3); \
352  Q_OUT ## 3 = vsubq_f32(Q_OUT ## 3, Q_OUT ## 1); \
353  Q_OUT ## 1 = Q_TMP; \
354  Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, DIV_TW81_NEON); \
355  Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, DIV_TW81_NEON); \
356  Q_OUT ## 0 = vaddq_f32( Q_OUT ## 0, Q_OUT ## 0 ); \
357  Q_OUT ## 2 = vaddq_f32( Q_OUT ## 2, Q_OUT ## 2 ); \
358 } while(0);
359 
360 #define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
361  Q2_IN ## 3 .val[1] = vnegq_f32(Q2_IN ## 3 .val[1]); \
362  Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
363  Q2_OUT ## 0 .val[1] = vsubq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
364  Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
365  Q2_OUT ## 2 .val[1] = vaddq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
366  Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
367  Q2_OUT ## 1 .val[1] = vaddq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[1]); \
368  Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 3 .val[1] , Q2_IN ## 1 .val[1]); \
369  Q2_OUT ## 3 .val[1] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
370 } while(0);
371 
372 #define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
373  NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 1); \
374  NE10_CPX_SUB_NEON_F32(Q2_OUT ## 2,Q2_IN ## 0,Q2_IN ## 1); \
375  NE10_CPX_ADD_NEON_F32(Q2_OUT ## 1,Q2_IN ## 2,Q2_IN ## 3); \
376  NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 2,Q2_IN ## 3); \
377 } while(0);
378 
379 #define NE10_RADIX4x4_R2C_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
380  NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
381  NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_IN,Q2_OUT); \
382  NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN); \
383 } while(0);
384 
385 #define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
386  NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN); \
387  NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_IN,Q2_OUT); \
388  NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
389 } while(0);
390 
391 #ifdef NE10_VERBOSE
392  #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) do { \
393  fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
394  fprintf(stderr, #Q_VECTOR "\n"); \
395  fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
396  fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
397  fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
398  fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
399  fprintf(stderr,"4:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 4[0], Q_VECTOR ## 4[1], Q_VECTOR ## 4[2], Q_VECTOR ## 4[3] ); \
400  fprintf(stderr,"5:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 5[0], Q_VECTOR ## 5[1], Q_VECTOR ## 5[2], Q_VECTOR ## 5[3] ); \
401  fprintf(stderr,"6:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 6[0], Q_VECTOR ## 6[1], Q_VECTOR ## 6[2], Q_VECTOR ## 6[3] ); \
402  fprintf(stderr,"7:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 7[0], Q_VECTOR ## 7[1], Q_VECTOR ## 7[2], Q_VECTOR ## 7[3] ); \
403  } while(0);
404  #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) do { \
405  fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
406  fprintf(stderr, #Q_VECTOR "\n"); \
407  fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
408  fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
409  fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
410  fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
411  } while(0);
412  #define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR) do { \
413  fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
414  fprintf(stderr, #Q_VECTOR "\n"); \
415  fprintf(stderr,"0R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[0][0], Q_VECTOR ## 0 .val[0][1], Q_VECTOR ## 0 .val[0][2], Q_VECTOR ## 0 .val[0][3] ); \
416  fprintf(stderr,"1R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[0][0], Q_VECTOR ## 1 .val[0][1], Q_VECTOR ## 1 .val[0][2], Q_VECTOR ## 1 .val[0][3] ); \
417  fprintf(stderr,"2R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[0][0], Q_VECTOR ## 2 .val[0][1], Q_VECTOR ## 2 .val[0][2], Q_VECTOR ## 2 .val[0][3] ); \
418  fprintf(stderr,"3R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[0][0], Q_VECTOR ## 3 .val[0][1], Q_VECTOR ## 3 .val[0][2], Q_VECTOR ## 3 .val[0][3] ); \
419  fprintf(stderr,"0I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[1][0], Q_VECTOR ## 0 .val[1][1], Q_VECTOR ## 0 .val[1][2], Q_VECTOR ## 0 .val[1][3] ); \
420  fprintf(stderr,"1I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[1][0], Q_VECTOR ## 1 .val[1][1], Q_VECTOR ## 1 .val[1][2], Q_VECTOR ## 1 .val[1][3] ); \
421  fprintf(stderr,"2I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[1][0], Q_VECTOR ## 2 .val[1][1], Q_VECTOR ## 2 .val[1][2], Q_VECTOR ## 2 .val[1][3] ); \
422  fprintf(stderr,"3I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[1][0], Q_VECTOR ## 3 .val[1][1], Q_VECTOR ## 3 .val[1][2], Q_VECTOR ## 3 .val[1][3] ); \
423  } while(0);
424 #else // NE10_VERBOSE not defined
425  #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) ;
426  #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) ;
427  #define NE10_PRINT_Q2x4_VECTOR(Q2_VECTOR) ;
428 #endif // NE10_VERBOSE
#endif /* NE10_FFT_NEONINTRINSIC_H */