Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fft_int16.neonintrinsic.c
1 /*
2  * Copyright 2013-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fft_int16.neon.c
30  */
31 
32 #include <arm_neon.h>
33 
34 #include "NE10_types.h"
35 #include "NE10_macros.h"
36 #include "NE10_fft.h"
37 
38 static inline void ne10_fft2_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
39  ne10_fft_cpx_int16_t * Fin)
40 
41 {
42  Fout[0].r = Fin[0].r + Fin[1].r;
43  Fout[0].i = Fin[0].i + Fin[1].i;
44  Fout[1].r = Fin[0].r - Fin[1].r;
45  Fout[1].i = Fin[0].i - Fin[1].i;
46 }
47 
48 static inline void ne10_fft2_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
49  ne10_fft_cpx_int16_t * Fin)
50 
51 {
52  Fout[0].r = Fin[0].r + Fin[1].r;
53  Fout[0].i = Fin[0].i + Fin[1].i;
54  Fout[1].r = Fin[0].r - Fin[1].r;
55  Fout[1].i = Fin[0].i - Fin[1].i;
56 }
57 
58 static inline void ne10_fft2_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
59  ne10_fft_cpx_int16_t * Fin)
60 
61 {
62  Fout[0].r = (Fin[0].r + Fin[1].r) >> 1;
63  Fout[0].i = (Fin[0].i + Fin[1].i) >> 1;
64  Fout[1].r = (Fin[0].r - Fin[1].r) >> 1;
65  Fout[1].i = (Fin[0].i - Fin[1].i) >> 1;
66 }
67 
68 static inline void ne10_fft2_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
69  ne10_fft_cpx_int16_t * Fin)
70 
71 {
72  Fout[0].r = (Fin[0].r + Fin[1].r) >> 1;
73  Fout[0].i = (Fin[0].i + Fin[1].i) >> 1;
74  Fout[1].r = (Fin[0].r - Fin[1].r) >> 1;
75  Fout[1].i = (Fin[0].i - Fin[1].i) >> 1;
76 }
77 
78 #define FFT4_FS_START \
79  ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
80  ne10_int16_t tmp_r, tmp_i;
81 
82 #define FFT4_FS \
83  s2_r = Fin[0].r - Fin[2].r; \
84  s2_i = Fin[0].i - Fin[2].i; \
85  tmp_r = Fin[0].r + Fin[2].r; \
86  tmp_i = Fin[0].i + Fin[2].i; \
87  s0_r = Fin[1].r + Fin[3].r; \
88  s0_i = Fin[1].i + Fin[3].i; \
89  s1_r = Fin[1].r - Fin[3].r; \
90  s1_i = Fin[1].i - Fin[3].i;
91 
92 #define FFT4_FS_SCALED \
93  s2_r = (Fin[0].r - Fin[2].r) >> 2; \
94  s2_i = (Fin[0].i - Fin[2].i) >> 2; \
95  tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
96  tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
97  s0_r = (Fin[1].r + Fin[3].r) >> 2; \
98  s0_i = (Fin[1].i + Fin[3].i) >> 2; \
99  s1_r = (Fin[1].r - Fin[3].r) >> 2; \
100  s1_i = (Fin[1].i - Fin[3].i) >> 2;
101 
102 #define FFT4_FWD_LS \
103  Fout[2].r = tmp_r - s0_r; \
104  Fout[2].i = tmp_i - s0_i; \
105  Fout[0].r = tmp_r + s0_r; \
106  Fout[0].i = tmp_i + s0_i; \
107  Fout[1].r = s2_r + s1_i; \
108  Fout[1].i = s2_i - s1_r; \
109  Fout[3].r = s2_r - s1_i; \
110  Fout[3].i = s2_i + s1_r;
111 
112 #define FFT4_INV_LS \
113  Fout[2].r = tmp_r - s0_r; \
114  Fout[2].i = tmp_i - s0_i; \
115  Fout[0].r = tmp_r + s0_r; \
116  Fout[0].i = tmp_i + s0_i; \
117  Fout[1].r = s2_r - s1_i; \
118  Fout[1].i = s2_i + s1_r; \
119  Fout[3].r = s2_r + s1_i; \
120  Fout[3].i = s2_i - s1_r;
121 
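/*
 * The FFT4 macros above implement the standard radix-4 butterfly. With
 * t = x0 + x2, s2 = x0 - x2, s0 = x1 + x3 and s1 = x1 - x3, the forward outputs are
 *   X0 = t + s0,  X2 = t - s0,  X1 = s2 - j*s1,  X3 = s2 + j*s1,
 * which is exactly what FFT4_FWD_LS computes; FFT4_INV_LS swaps the sign of the
 * j*s1 term for the inverse transform. FFT4_FS_SCALED shifts every intermediate
 * right by 2, so the scaled 4-point transform is divided by 4.
 */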
122 static inline void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
123  ne10_fft_cpx_int16_t * Fin)
124 
125 {
126  FFT4_FS_START
127  FFT4_FS
128  FFT4_FWD_LS
129 }
130 
131 static inline void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
132  ne10_fft_cpx_int16_t * Fin)
133 
134 {
135  FFT4_FS_START
136  FFT4_FS
137  FFT4_INV_LS
138 }
139 static inline void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
140  ne10_fft_cpx_int16_t * Fin)
141 
142 {
143  FFT4_FS_START
144  FFT4_FS_SCALED
145  FFT4_FWD_LS
146 }
147 
148 static inline void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
149  ne10_fft_cpx_int16_t * Fin)
150 
151 {
152  FFT4_FS_START
153  FFT4_FS_SCALED
154  FFT4_INV_LS
155 }
156 
157 #define FFT8_FS_START \
158  ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
159  ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
160  const ne10_int16_t TW_81 = 23169;
161 
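/*
 * TW_81 is the Q15 representation of cos(pi/4) = sqrt(2)/2 (23169 / 32768 ~= 0.7071),
 * the magnitude of the radix-8 twiddles W8^1 and W8^3. The scalar paths apply it with a
 * 32-bit product shifted right by NE10_F2I16_SHIFT; the NEON paths use vqdmulh_s16,
 * which yields the equivalent (a * b) >> 15 with saturation.
 */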
162 #define FFT8_FS \
163  s0_r = Fin[0].r + Fin[4].r; \
164  s0_i = Fin[0].i + Fin[4].i; \
165  s1_r = Fin[0].r - Fin[4].r; \
166  s1_i = Fin[0].i - Fin[4].i; \
167  s2_r = Fin[1].r + Fin[5].r; \
168  s2_i = Fin[1].i + Fin[5].i; \
169  s3_r = Fin[1].r - Fin[5].r; \
170  s3_i = Fin[1].i - Fin[5].i; \
171  s4_r = Fin[2].r + Fin[6].r; \
172  s4_i = Fin[2].i + Fin[6].i; \
173  s5_r = Fin[2].r - Fin[6].r; \
174  s5_i = Fin[2].i - Fin[6].i; \
175  s6_r = Fin[3].r + Fin[7].r; \
176  s6_i = Fin[3].i + Fin[7].i; \
177  s7_r = Fin[3].r - Fin[7].r; \
178  s7_i = Fin[3].i - Fin[7].i;
179 
180 #define FFT8_FS_SCALED \
181  s0_r = (Fin[0].r + Fin[4].r) >> 3; \
182  s0_i = (Fin[0].i + Fin[4].i) >> 3; \
183  s1_r = (Fin[0].r - Fin[4].r) >> 3; \
184  s1_i = (Fin[0].i - Fin[4].i) >> 3; \
185  s2_r = (Fin[1].r + Fin[5].r) >> 3; \
186  s2_i = (Fin[1].i + Fin[5].i) >> 3; \
187  s3_r = (Fin[1].r - Fin[5].r) >> 3; \
188  s3_i = (Fin[1].i - Fin[5].i) >> 3; \
189  s4_r = (Fin[2].r + Fin[6].r) >> 3; \
190  s4_i = (Fin[2].i + Fin[6].i) >> 3; \
191  s5_r = (Fin[2].r - Fin[6].r) >> 3; \
192  s5_i = (Fin[2].i - Fin[6].i) >> 3; \
193  s6_r = (Fin[3].r + Fin[7].r) >> 3; \
194  s6_i = (Fin[3].i + Fin[7].i) >> 3; \
195  s7_r = (Fin[3].r - Fin[7].r) >> 3; \
196  s7_i = (Fin[3].i - Fin[7].i) >> 3;
197 
198 
199 #define FFT8_FWD_LS \
200  t0_r = s0_r - s4_r; \
201  t0_i = s0_i - s4_i; \
202  t1_r = s0_r + s4_r; \
203  t1_i = s0_i + s4_i; \
204  t2_r = s2_r + s6_r; \
205  t2_i = s2_i + s6_i; \
206  t3_r = s2_r - s6_r; \
207  t3_i = s2_i - s6_i; \
208  Fout[0].r = t1_r + t2_r; \
209  Fout[0].i = t1_i + t2_i; \
210  Fout[4].r = t1_r - t2_r; \
211  Fout[4].i = t1_i - t2_i; \
212  Fout[2].r = t0_r + t3_i; \
213  Fout[2].i = t0_i - t3_r; \
214  Fout[6].r = t0_r - t3_i; \
215  Fout[6].i = t0_i + t3_r; \
216  t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
217  t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
218  t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
219  t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
220  t0_r = s1_r - s5_i; \
221  t0_i = s1_i + s5_r; \
222  t1_r = s1_r + s5_i; \
223  t1_i = s1_i - s5_r; \
224  t2_r = t4_r - t5_r; \
225  t2_i = t4_i - t5_i; \
226  t3_r = t4_r + t5_r; \
227  t3_i = t4_i + t5_i; \
228  Fout[1].r = t1_r + t2_r; \
229  Fout[1].i = t1_i + t2_i; \
230  Fout[5].r = t1_r - t2_r; \
231  Fout[5].i = t1_i - t2_i; \
232  Fout[3].r = t0_r + t3_i; \
233  Fout[3].i = t0_i - t3_r; \
234  Fout[7].r = t0_r - t3_i; \
235  Fout[7].i = t0_i + t3_r;
236 
237 #define FFT8_INV_LS \
238  t0_r = s0_r - s4_r; \
239  t0_i = s0_i - s4_i; \
240  t1_r = s0_r + s4_r; \
241  t1_i = s0_i + s4_i; \
242  t2_r = s2_r + s6_r; \
243  t2_i = s2_i + s6_i; \
244  t3_r = s2_r - s6_r; \
245  t3_i = s2_i - s6_i; \
246  Fout[0].r = t1_r + t2_r; \
247  Fout[0].i = t1_i + t2_i; \
248  Fout[4].r = t1_r - t2_r; \
249  Fout[4].i = t1_i - t2_i; \
250  Fout[2].r = t0_r - t3_i; \
251  Fout[2].i = t0_i + t3_r; \
252  Fout[6].r = t0_r + t3_i; \
253  Fout[6].i = t0_i - t3_r; \
254  t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
255  t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
256  t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
257  t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
258  t0_r = s1_r + s5_i; \
259  t0_i = s1_i - s5_r; \
260  t1_r = s1_r - s5_i; \
261  t1_i = s1_i + s5_r; \
262  t2_r = t4_r - t5_r; \
263  t2_i = t4_i - t5_i; \
264  t3_r = t4_r + t5_r; \
265  t3_i = t4_i + t5_i; \
266  Fout[1].r = t1_r + t2_r; \
267  Fout[1].i = t1_i + t2_i; \
268  Fout[5].r = t1_r - t2_r; \
269  Fout[5].i = t1_i - t2_i; \
270  Fout[3].r = t0_r - t3_i; \
271  Fout[3].i = t0_i + t3_r; \
272  Fout[7].r = t0_r + t3_i; \
273  Fout[7].i = t0_i - t3_r;
274 
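/*
 * The FFT8 macros split an 8-point transform into a radix-2 stage (FFT8_FS computes
 * s[k] = x[k] +/- x[k+4]) followed by two 4-point combines: one producing the even
 * outputs Fout[0,2,4,6] and one producing the odd outputs Fout[1,3,5,7], where t4/t5
 * apply the +/-45 degree twiddles via TW_81. FFT8_FS_SCALED shifts by 3, so the scaled
 * 8-point transform is divided by 8.
 */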
275 static inline void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
276  ne10_fft_cpx_int16_t * Fin)
277 
278 {
279  FFT8_FS_START
280  FFT8_FS
281  FFT8_FWD_LS
282 }
283 
284 static inline void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
285  ne10_fft_cpx_int16_t * Fin)
286 
287 {
288  FFT8_FS_START
289  FFT8_FS
290  FFT8_INV_LS
291 }
292 static inline void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
293  ne10_fft_cpx_int16_t * Fin)
294 
295 {
296  FFT8_FS_START
297  FFT8_FS_SCALED
298  FFT8_FWD_LS
299 }
300 
301 static inline void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
302  ne10_fft_cpx_int16_t * Fin)
303 
304 {
305  FFT8_FS_START
306  FFT8_FS_SCALED
307  FFT8_INV_LS
308 }
309 
310 #define RADIX8x4_START \
311  ne10_int32_t f_count; \
312  ne10_int32_t src_step = stride << 1; \
313  const ne10_int16_t TW_81 = 23169; \
314  const ne10_int16_t TW_81N = -23169; \
315  int16_t *p_src, *p_dst; \
316  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
317  int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
318  int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
319  int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
320  int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
321  int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
322  int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
323  int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
324  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
325  int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
326  int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
327  int16x4_t d_tw_81, d_tw_81n; \
328  p_src = (int16_t *) Fin; \
329  p_dst = (int16_t *) Fout;
330 
331 
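/*
 * The RADIX8x4 kernels below process four independent 8-point butterflies per loop
 * iteration: vld2_s16 de-interleaves four consecutive complex int16 samples into a
 * real-lane vector (val[0]) and an imaginary-lane vector (val[1]), so each NEON
 * operation advances four transforms at once. RADIX8x4_STORE transposes the results
 * with vtrnq_s16/vtrnq_s32 before writing them back out with vst2_s16.
 */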
332 #define RADIX8x4_LOAD \
333  d2_in0 = vld2_s16 (p_src); \
334  p_src += src_step; \
335  d2_in2 = vld2_s16 (p_src); \
336  p_src += src_step; \
337  d2_in4 = vld2_s16 (p_src); \
338  p_src += src_step; \
339  d2_in6 = vld2_s16 (p_src); \
340  p_src += src_step; \
341  d2_in1 = vld2_s16 (p_src); \
342  p_src += src_step; \
343  d2_in3 = vld2_s16 (p_src); \
344  p_src += src_step; \
345  d2_in5 = vld2_s16 (p_src); \
346  p_src += src_step; \
347  d2_in7 = vld2_s16 (p_src); \
348  p_src += src_step;
349 
350 #define RADIX8x4_STORE \
351  q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
352  q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
353  q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
354  q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
355  q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
356  q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
357  q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
358  q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
359  d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
360  d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
361  d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
362  d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
363  d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
364  d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
365  d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
366  d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
367  d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
368  d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
369  d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
370  d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
371  d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
372  d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
373  d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
374  d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
375  vst2_s16 (p_dst, d2_out0); \
376  p_dst += 8; \
377  vst2_s16 (p_dst, d2_out1); \
378  p_dst += 8; \
379  vst2_s16 (p_dst, d2_out2); \
380  p_dst += 8; \
381  vst2_s16 (p_dst, d2_out3); \
382  p_dst += 8; \
383  vst2_s16 (p_dst, d2_out4); \
384  p_dst += 8; \
385  vst2_s16 (p_dst, d2_out5); \
386  p_dst += 8; \
387  vst2_s16 (p_dst, d2_out6); \
388  p_dst += 8; \
389  vst2_s16 (p_dst, d2_out7); \
390  p_dst += 8; \
391  p_src = p_src - src_step * 8 + 8;
392 
393 #define RADIX8x4_FS_S0 \
394  d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
395  d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
396  d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
397  d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
398  d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
399  d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
400  d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
401  d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
402  d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
403  d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
404  d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
405  d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
406  d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
407  d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
408  d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
409  d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);
410 
411 #define RADIX8x4_FWD_S357 \
412  d_tw_81 = vdup_n_s16 (TW_81); \
413  d_tw_81n = vdup_n_s16 (TW_81N); \
414  d_s5_r = d_sin5_i; \
415  d_s5_i = vneg_s16 (d_sin5_r); \
416  d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
417  d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
418  d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
419  d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
420  d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
421  d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
422  d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
423  d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
424 
425 #define RADIX8x4_INV_S357 \
426  d_tw_81 = vdup_n_s16 (TW_81); \
427  d_tw_81n = vdup_n_s16 (TW_81N); \
428  d_s5_r = vneg_s16 (d_sin5_i); \
429  d_s5_i = d_sin5_r; \
430  d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
431  d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
432  d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
433  d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
434  d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
435  d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
436  d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
437  d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
438 
439 #define RADIX8x4_LS_02 \
440  d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
441  d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
442  d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
443  d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
444  d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
445  d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
446  d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
447  d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
448  d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
449  d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
450  d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
451  d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
452  d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
453  d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
454  d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
455  d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
456  d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
457  d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
458  d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
459  d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
460  d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
461  d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
462  d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
463  d_out1_i = vadd_s16 (d_s9_i, d_s13_i);
464 
465 #define RADIX8x4_FS_S0_SCALED \
466  d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
467  d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
468  d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
469  d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
470  d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
471  d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
472  d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
473  d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
474  d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
475  d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
476  d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
477  d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
478  d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
479  d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
480  d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
481  d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);
482 
483 #define RADIX8x4_LS_02_SCALED \
484  d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
485  d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
486  d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
487  d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
488  d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
489  d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
490  d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
491  d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
492  d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
493  d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
494  d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
495  d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
496  d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
497  d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
498  d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
499  d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
500  d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
501  d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
502  d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
503  d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
504  d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
505  d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
506  d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
507  d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);
508 
509 
510 static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
511  ne10_fft_cpx_int16_t * Fin,
512  ne10_int32_t stride)
513 {
514  RADIX8x4_START
515 
516  for (f_count = 0; f_count < stride; f_count += 4)
517  {
518  RADIX8x4_LOAD
519  RADIX8x4_FS_S0
520 
521 
522  // radix 4 butterfly without twiddles
523  RADIX8x4_FWD_S357
524  RADIX8x4_LS_02
525 
526  d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
527  d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
528  d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
529  d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
530  d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
531  d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
532  d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
533  d_out7_i = vadd_s16 (d_s11_i, d_s15_r);
534 
535  RADIX8x4_STORE
536  } // f_count
537 }
538 
539 static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
540  ne10_fft_cpx_int16_t * Fin,
541  ne10_int32_t stride)
542 {
543  RADIX8x4_START
544 
545  for (f_count = 0; f_count < stride; f_count += 4)
546  {
547  RADIX8x4_LOAD
548  RADIX8x4_FS_S0
549 
550  // radix 4 butterfly without twiddles
551  RADIX8x4_INV_S357
552  RADIX8x4_LS_02
553 
554  d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
555  d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
556  d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
557  d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
558  d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
559  d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
560  d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
561  d_out7_i = vsub_s16 (d_s11_i, d_s15_r);
562 
563  RADIX8x4_STORE
564  } // f_count
565 }
566 static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
567  ne10_fft_cpx_int16_t * Fin,
568  ne10_int32_t stride)
569 {
570  RADIX8x4_START
571 
572  for (f_count = 0; f_count < stride; f_count += 4)
573  {
574  RADIX8x4_LOAD
575  RADIX8x4_FS_S0_SCALED
576 
577  // radix 4 butterfly without twiddles
578  RADIX8x4_FWD_S357
579  RADIX8x4_LS_02_SCALED
580 
581  d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
582  d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
583  d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
584  d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
585  d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
586  d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
587  d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
588  d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);
589 
590  RADIX8x4_STORE
591  } // f_count
592 }
593 
594 static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
595  ne10_fft_cpx_int16_t * Fin,
596  ne10_int32_t stride)
597 {
598  RADIX8x4_START
599 
600  for (f_count = 0; f_count < stride; f_count += 4)
601  {
602  RADIX8x4_LOAD
603  RADIX8x4_FS_S0_SCALED
604 
605  // radix 4 butterfly without twiddles
606  RADIX8x4_INV_S357
607  RADIX8x4_LS_02_SCALED
608 
609  d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
610  d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
611  d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
612  d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
613  d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
614  d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
615  d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
616  d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);
617 
618  RADIX8x4_STORE
619  } // f_count
620 }
621 
622 #define RADIX4x4_WITHOUT_TW_START \
623  ne10_int32_t f_count; \
624  ne10_int32_t src_step = stride << 1; \
625  int16_t *p_src, *p_dst; \
626  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
627  int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
628  int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
629  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
630  int16x8x2_t q2_tmp0, q2_tmp1; \
631  int32x4x2_t q2_tmp2, q2_tmp3; \
632  p_src = (int16_t *) Fin; \
633  p_dst = (int16_t *) Fout;
634 
635 #define RADIX4x4_WITHOUT_TW_LOAD \
636  d2_in0 = vld2_s16 (p_src); \
637  p_src += src_step; \
638  d2_in1 = vld2_s16 (p_src); \
639  p_src += src_step; \
640  d2_in2 = vld2_s16 (p_src); \
641  p_src += src_step; \
642  d2_in3 = vld2_s16 (p_src); \
643  p_src += src_step;
644 
645 #define RADIX4x4_WITHOUT_TW_STORE \
646  q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
647  q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
648  q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
649  q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
650  d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
651  d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
652  d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
653  d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
654  d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
655  d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
656  d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
657  d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
658  vst2_s16 (p_dst, d2_out0); \
659  p_dst += 8; \
660  vst2_s16 (p_dst, d2_out1); \
661  p_dst += 8; \
662  vst2_s16 (p_dst, d2_out2); \
663  p_dst += 8; \
664  vst2_s16 (p_dst, d2_out3); \
665  p_dst += 8; \
666  p_src = p_src - src_step * 4 + 8;
667 
668 #define RADIX4x4_WITHOUT_TW_S0 \
669  d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
670  d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
671  d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
672  d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
673  d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
674  d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
675  d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
676  d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
677  d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
678  d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
679  d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
680  d_out0_i = vadd_s16 (d_s0_i, d_s2_i);
681 
682 #define RADIX4x4_WITHOUT_TW_S0_SCALED \
683  d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
684  d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
685  d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
686  d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
687  d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
688  d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
689  d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
690  d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
691  d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
692  d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
693  d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
694  d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);
695 
696 
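/*
 * The *_SCALED variants of these kernels replace vadd_s16/vsub_s16 with the halving
 * forms vhadd_s16/vhsub_s16, so every butterfly level divides its result by 2. This
 * keeps the 16-bit intermediates from overflowing, at the cost of a scaled-down output.
 */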
697 static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
698  ne10_fft_cpx_int16_t * Fin,
699  ne10_int32_t stride)
700 {
701  RADIX4x4_WITHOUT_TW_START
702 
703  for (f_count = 0; f_count < stride; f_count += 4)
704  {
705  // load
706  RADIX4x4_WITHOUT_TW_LOAD
707 
708  // radix 4 butterfly without twiddles
709  RADIX4x4_WITHOUT_TW_S0
710 
711  d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
712  d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
713  d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
714  d_out3_i = vadd_s16 (d_s1_i, d_s3_r);
715 
716  RADIX4x4_WITHOUT_TW_STORE
717  }
718 }
719 
720 static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
721  ne10_fft_cpx_int16_t * Fin,
722  ne10_int32_t stride)
723 {
724  RADIX4x4_WITHOUT_TW_START
725 
726  for (f_count = 0; f_count < stride; f_count += 4)
727  {
728  // load
729  RADIX4x4_WITHOUT_TW_LOAD
730 
731  // radix 4 butterfly without twiddles
732  RADIX4x4_WITHOUT_TW_S0
733 
734  d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
735  d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
736  d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
737  d_out3_i = vsub_s16 (d_s1_i, d_s3_r);
738 
739  RADIX4x4_WITHOUT_TW_STORE
740  }
741 }
742 
743 static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
744  ne10_fft_cpx_int16_t * Fin,
745  ne10_int32_t stride)
746 {
747  RADIX4x4_WITHOUT_TW_START
748 
749  for (f_count = 0; f_count < stride; f_count += 4)
750  {
751  // load
752  RADIX4x4_WITHOUT_TW_LOAD
753 
754  // radix 4 butterfly without twiddles
755  RADIX4x4_WITHOUT_TW_S0_SCALED
756 
757  d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
758  d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
759  d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
760  d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);
761 
762  RADIX4x4_WITHOUT_TW_STORE
763  }
764 }
765 
766 static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
767  ne10_fft_cpx_int16_t * Fin,
768  ne10_int32_t stride)
769 {
770  RADIX4x4_WITHOUT_TW_START
771 
772  for (f_count = 0; f_count < stride; f_count += 4)
773  {
774  // load
775  RADIX4x4_WITHOUT_TW_LOAD
776 
777  // radix 4 butterfly without twiddles
778  RADIX4x4_WITHOUT_TW_S0_SCALED
779 
780  d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
781  d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
782  d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
783  d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);
784 
785  RADIX4x4_WITHOUT_TW_STORE
786  }
787 }
788 
789 #define RADIX4x4_WITH_TW_START \
790  ne10_int32_t m_count; \
791  ne10_int32_t src_step = src_stride << 1; \
792  ne10_int32_t dst_step = dst_stride << 1; \
793  ne10_int32_t tw_step = mstride << 1; \
794  int16_t *p_src, *p_dst, *p_tw; \
795  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
796  int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
797  int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
798  int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
799  int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
800  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
801  p_src = (int16_t *) Fin; \
802  p_dst = (int16_t *) Fout; \
803  p_tw = (int16_t *) tw;
804 
805 #define RADIX4x4_WITH_TW_LOAD \
806  d2_in0 = vld2_s16 (p_src); \
807  p_src += src_step; \
808  d2_in1 = vld2_s16 (p_src); \
809  p_src += src_step; \
810  d2_in2 = vld2_s16 (p_src); \
811  p_src += src_step; \
812  d2_in3 = vld2_s16 (p_src); \
813  p_src += src_step; \
814  d2_tw0 = vld2_s16 (p_tw); \
815  p_tw += tw_step; \
816  d2_tw1 = vld2_s16 (p_tw); \
817  p_tw += tw_step; \
818  d2_tw2 = vld2_s16 (p_tw); \
819  d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
820  d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
821  d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
822  d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
823  d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
824  d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
825  d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
826  d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
827  d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
828  d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
829  d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
830  d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);
831 
832 #define RADIX4x4_WITH_TW_STORE \
833  vst2_s16 (p_dst, d2_out0); \
834  p_dst += dst_step; \
835  vst2_s16 (p_dst, d2_out1); \
836  p_dst += dst_step; \
837  vst2_s16 (p_dst, d2_out2); \
838  p_dst += dst_step; \
839  vst2_s16 (p_dst, d2_out3); \
840  p_dst += dst_step; \
841  p_src = p_src - src_step * 4 + 8; \
842  p_dst = p_dst - dst_step * 4 + 8; \
843  p_tw = p_tw - tw_step * 2 + 8;
844 
845 #define RADIX4x4_WITH_TW_S1_FWD \
846  d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
847  d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
848  d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
849  d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
850  d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
851  d_s3_i = vadd_s16 (d_s3_i, d_tmp5);
852 
853 #define RADIX4x4_WITH_TW_S1_INV \
854  d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
855  d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
856  d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
857  d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
858  d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
859  d_s3_i = vsub_s16 (d_s3_i, d_tmp5);
860 
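/*
 * RADIX4x4_WITH_TW_LOAD forms the four partial products of a Q15 complex multiply with
 * vqdmulh_s16 ((a * b) >> 15 with saturation). RADIX4x4_WITH_TW_S1_FWD combines them as
 * in * tw (ar*br - ai*bi, ai*br + ar*bi), while RADIX4x4_WITH_TW_S1_INV combines them as
 * in * conj(tw); this is how the inverse transform reuses the same twiddle table.
 */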
861 
862 #define RADIX4x4_WITH_TW_LS_02 \
863  d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
864  d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
865  d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
866  d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
867  d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
868  d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
869  d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
870  d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
871  d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
872  d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
873  d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
874  d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);
875 
876 #define RADIX4x4_WITH_TW_LS_02_SCALED \
877  d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
878  d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
879  d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
880  d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
881  d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
882  d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
883  d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
884  d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
885  d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
886  d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
887  d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
888  d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);
889 
890 
891 static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
892  ne10_fft_cpx_int16_t * Fin,
893  ne10_fft_cpx_int16_t * tw,
894  ne10_int32_t src_stride,
895  ne10_int32_t dst_stride,
896  ne10_int32_t mstride)
897 {
898  RADIX4x4_WITH_TW_START
899 
900  for (m_count = 0; m_count < mstride; m_count += 4)
901  {
902  // load
903  RADIX4x4_WITH_TW_LOAD
904  RADIX4x4_WITH_TW_S1_FWD
905 
906  RADIX4x4_WITH_TW_LS_02
907 
908  d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
909  d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
910  d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
911  d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);
912 
913  // store
914  RADIX4x4_WITH_TW_STORE
915  }
916 }
917 
918 
919 static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
920  ne10_fft_cpx_int16_t * Fin,
921  ne10_fft_cpx_int16_t * tw,
922  ne10_int32_t src_stride,
923  ne10_int32_t dst_stride,
924  ne10_int32_t mstride)
925 {
926  RADIX4x4_WITH_TW_START
927 
928  for (m_count = 0; m_count < mstride; m_count += 4)
929  {
930  // load
931  RADIX4x4_WITH_TW_LOAD
932  RADIX4x4_WITH_TW_S1_INV
933 
934  RADIX4x4_WITH_TW_LS_02
935 
936  d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
937  d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
938  d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
939  d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);
940 
941  // store
942  RADIX4x4_WITH_TW_STORE
943  }
944 }
945 
946 
947 
948 static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
949  ne10_fft_cpx_int16_t * Fin,
950  ne10_fft_cpx_int16_t * tw,
951  ne10_int32_t src_stride,
952  ne10_int32_t dst_stride,
953  ne10_int32_t mstride)
954 {
955  RADIX4x4_WITH_TW_START
956 
957  for (m_count = 0; m_count < mstride; m_count += 4)
958  {
959  // load
960  RADIX4x4_WITH_TW_LOAD
961  RADIX4x4_WITH_TW_S1_FWD
962 
963  RADIX4x4_WITH_TW_LS_02_SCALED
964 
965  d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
966  d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
967  d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
968  d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
969 
970  // store
971  RADIX4x4_WITH_TW_STORE
972  }
973 }
974 
975 static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
976  ne10_fft_cpx_int16_t * Fin,
977  ne10_fft_cpx_int16_t * tw,
978  ne10_int32_t src_stride,
979  ne10_int32_t dst_stride,
980  ne10_int32_t mstride)
981 {
982  RADIX4x4_WITH_TW_START
983 
984  for (m_count = 0; m_count < mstride; m_count += 4)
985  {
986  // load
987  RADIX4x4_WITH_TW_LOAD
988  RADIX4x4_WITH_TW_S1_INV
989 
990  RADIX4x4_WITH_TW_LS_02_SCALED
991 
992  d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
993  d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
994  d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
995  d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
996 
997  // store
998  RADIX4x4_WITH_TW_STORE
999  }
1000 }
1001 
1002 
1003 #define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
1004 void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1005  ne10_fft_cpx_int16_t * Fin, \
1006  ne10_int32_t * factors, \
1007  ne10_fft_cpx_int16_t * twiddles, \
1008  ne10_fft_cpx_int16_t * buffer) \
1009 { \
1010  ne10_int32_t fstride, mstride, N; \
1011  ne10_int32_t fstride1; \
1012  ne10_int32_t f_count; \
1013  ne10_int32_t stage_count; \
1014  \
1015  ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1016  ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1017  ne10_fft_cpx_int16_t *Ftmp; \
1018  ne10_fft_cpx_int16_t *tw, *tw1; \
1019  \
1020  /* init fstride, mstride, N */ \
1021  stage_count = factors[0]; \
1022  fstride = factors[1]; \
1023  mstride = factors[ (stage_count << 1) - 1 ]; \
1024  N = factors[ stage_count << 1 ]; \
1025  \
1026  /* the first stage */ \
1027  Fin1 = Fin; \
1028  Fout1 = Fout; \
1029  if (N == 8) \
1030  { \
1031  N = fstride << 1;\
1032  tw = twiddles; \
1033  ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride);\
1034  \
1035  fstride >>= 2; \
1036  stage_count--; \
1037  \
1038  Ftmp = Fin; \
1039  Fin = Fout; \
1040  Fout = Ftmp; \
1041  } \
1042  else if (N == 4) \
1043  { \
1044  ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1045  N = fstride; \
1046  Ftmp = Fin; \
1047  Fin = Fout; \
1048  Fout = Ftmp; \
1049  /* update address for other stages*/ \
1050  stage_count--; \
1051  tw = twiddles; \
1052  fstride >>= 2; \
1053  } \
1054  /* others but the last one*/ \
1055  for (; stage_count > 1 ; stage_count--) \
1056  { \
1057  Fin1 = Fin; \
1058  for (f_count = 0; f_count < fstride; f_count ++) \
1059  { \
1060  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1061  tw1 = tw; \
1062  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1063  Fin1 += mstride; \
1064  } \
1065  tw += mstride * 3; \
1066  mstride <<= 2; \
1067  Ftmp = Fin; \
1068  Fin = Fout; \
1069  Fout = Ftmp; \
1070  fstride >>= 2; \
1071  }\
1072  /* the last one*/ \
1073  if (stage_count) \
1074  { \
1075  Fin1 = Fin; \
1076  Fout1 = Fout_ls; \
1077  for (f_count = 0; f_count < fstride; f_count ++) \
1078  { \
1079  tw1 = tw; \
1080  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1081  Fin1 += mstride; \
1082  Fout1 += mstride; \
1083  } \
1084  } \
1085 }
1086 
1087 #define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
1088 void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1089  ne10_fft_cpx_int16_t * Fin, \
1090  ne10_int32_t * factors, \
1091  ne10_fft_cpx_int16_t * twiddles, \
1092  ne10_fft_cpx_int16_t * buffer) \
1093 { \
1094  ne10_int32_t fstride, mstride, N; \
1095  ne10_int32_t fstride1; \
1096  ne10_int32_t f_count; \
1097  ne10_int32_t stage_count; \
1098  \
1099  ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1100  ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1101  ne10_fft_cpx_int16_t *Ftmp; \
1102  ne10_fft_cpx_int16_t *tw, *tw1; \
1103  \
1104  /* init fstride, mstride, N */ \
1105  stage_count = factors[0]; \
1106  fstride = factors[1]; \
1107  mstride = factors[ (stage_count << 1) - 1 ]; \
1108  N = factors[ stage_count << 1 ]; \
1109  \
1110  /* the first stage */ \
1111  Fin1 = Fin; \
1112  Fout1 = Fout; \
1113  if (N == 8) \
1114  { \
1115  N = fstride << 1;\
1116  tw = twiddles; \
1117  ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride);\
1118  \
1119  fstride >>= 2; \
1120  stage_count--; \
1121  \
1122  Ftmp = Fin; \
1123  Fin = Fout; \
1124  Fout = Ftmp; \
1125  } \
1126  else if (N == 4) \
1127  { \
1128  ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1129  N = fstride; \
1130  Ftmp = Fin; \
1131  Fin = Fout; \
1132  Fout = Ftmp; \
1133  /* update address for other stages*/ \
1134  stage_count--; \
1135  tw = twiddles; \
1136  fstride >>= 2; \
1137  } \
1138  /* others but the last one*/ \
1139  for (; stage_count > 1 ; stage_count--) \
1140  { \
1141  Fin1 = Fin; \
1142  for (f_count = 0; f_count < fstride; f_count ++) \
1143  { \
1144  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1145  tw1 = tw; \
1146  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1147  Fin1 += mstride; \
1148  } \
1149  tw += mstride * 3; \
1150  mstride <<= 2; \
1151  Ftmp = Fin; \
1152  Fin = Fout; \
1153  Fout = Ftmp; \
1154  fstride >>= 2; \
1155  }\
1156  /* the last one*/ \
1157  if (stage_count) \
1158  { \
1159  Fin1 = Fin; \
1160  Fout1 = Fout_ls; \
1161  for (f_count = 0; f_count < fstride; f_count ++) \
1162  { \
1163  tw1 = tw; \
1164  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1165  Fin1 += mstride; \
1166  Fout1 += mstride; \
1167  } \
1168  } \
1169 }
1170 
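/*
 * The two macros above expand into the four mixed-radix drivers (forward/backward,
 * scaled/unscaled). Each driver reads the stage count, fstride and mstride from
 * factors[], runs a radix-8x4 or radix-4x4 first stage without twiddles, then applies
 * the radix-4x4 with-twiddles kernel stage by stage (mstride growing by 4x while
 * fstride shrinks by 4x), ping-ponging between the Fin and Fout buffers and writing
 * the final stage back into the caller's output buffer (Fout_ls).
 */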
1171 
1172 ne10_mixed_radix_fft_forward_int16_neon (unscaled)
1173 ne10_mixed_radix_fft_forward_int16_neon (scaled)
1174 ne10_mixed_radix_fft_backward_int16_neon (unscaled)
1175 ne10_mixed_radix_fft_backward_int16_neon (scaled)
1176 
1177 
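/*
 * ne10_fft_split_r2c_1d_int16_neon and ne10_fft_split_c2r_1d_int16_neon implement the
 * usual trick of computing a 2N-point real FFT via an N-point complex FFT: the complex
 * spectrum is split into a symmetric part F[k] + conj(F[N-k]) and an antisymmetric part
 * F[k] - conj(F[N-k]); the latter is rotated by the "super twiddles" before the halves
 * are recombined (with a final halving). The NEON paths handle eight bins per loop
 * iteration and fall back to the scalar loop when ncfft/2 < 8.
 */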
1178 static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1179  const ne10_fft_cpx_int16_t *src,
1180  ne10_fft_cpx_int16_t *twiddles,
1181  ne10_int32_t ncfft,
1182  ne10_int32_t scaled_flag)
1183 {
1184  ne10_int32_t k;
1185  ne10_int32_t count = ncfft / 2;
1186  ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
1187  int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1188  int16x8_t q_fpnk_r, q_fpnk_i;
1189  int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1190  int16x8_t q_tw_r, q_tw_i;
1191  int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1192  int16x8_t q_dst2_r, q_dst2_i;
1193  int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1194 
1195  tdc.r = src[0].r;
1196  tdc.i = src[0].i;
1197 
1198  if (scaled_flag)
1199  NE10_F2I16_FIXDIV (tdc, 2);
1200 
1201  dst[0].r = tdc.r + tdc.i;
1202  dst[ncfft].r = tdc.r - tdc.i;
1203  dst[ncfft].i = dst[0].i = 0;
1204  if (count >= 8)
1205  {
1206 
1207  if (scaled_flag)
1208  {
1209  for (k = 1; k <= count ; k += 8)
1210  {
1211  p_src = (int16_t*) (& (src[k]));
1212  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1213  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1214  p_dst = (int16_t*) (& (dst[k]));
1215  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1216 
1217  q2_fpk = vld2q_s16 (p_src);
1218  q2_fpnk = vld2q_s16 (p_src2);
1219 
1220  q2_tw = vld2q_s16 (p_twiddles);
1221  q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1222  q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1223  q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1224  q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1225  q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1226  q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1227  q_fpnk_i = vnegq_s16 (q_fpnk_i);
1228 
1229  q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1230  q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1231 
1232  q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1233  q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1234 
1235  q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1236  q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1237  q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1238  q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1239  q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1240  q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1241 
1242  q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1243  q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1244  q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1245  q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1246  q_dst2_r = vrev32q_s16 (q_dst2_r);
1247  q_dst2_i = vrev32q_s16 (q_dst2_i);
1248  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1249  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1250  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1251  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1252  vst2q_s16 (p_dst, q2_dst);
1253  vst2q_s16 (p_dst2, q2_dst2);
1254 
1255  }
1256  }
1257  else
1258  {
1259  for (k = 1; k <= count ; k += 8)
1260  {
1261  p_src = (int16_t*) (& (src[k]));
1262  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1263  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1264  p_dst = (int16_t*) (& (dst[k]));
1265  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1266 
1267  q2_fpk = vld2q_s16 (p_src);
1268  q2_fpnk = vld2q_s16 (p_src2);
1269 
1270  q2_tw = vld2q_s16 (p_twiddles);
1271  q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1272  q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1273  q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1274  q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1275  q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1276  q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1277  q_fpnk_i = vnegq_s16 (q_fpnk_i);
1278 
1279  q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1280  q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1281 
1282  q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1283  q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1284 
1285  q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1286  q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1287  q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1288  q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1289  q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1290  q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1291 
1292  q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1293  q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1294  q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1295  q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1296  q_dst2_r = vrev32q_s16 (q_dst2_r);
1297  q_dst2_i = vrev32q_s16 (q_dst2_i);
1298  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1299  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1300  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1301  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1302  vst2q_s16 (p_dst, q2_dst);
1303  vst2q_s16 (p_dst2, q2_dst2);
1304 
1305  }
1306  }
1307  }
1308  else
1309  {
1310 
1311  for (k = 1; k <= ncfft / 2 ; ++k)
1312  {
1313  fpk = src[k];
1314  fpnk.r = src[ncfft - k].r;
1315  fpnk.i = - src[ncfft - k].i;
1316  if (scaled_flag)
1317  {
1318  NE10_F2I16_FIXDIV (fpk, 2);
1319  NE10_F2I16_FIXDIV (fpnk, 2);
1320  }
1321 
1322  f1k.r = fpk.r + fpnk.r;
1323  f1k.i = fpk.i + fpnk.i;
1324 
1325  f2k.r = fpk.r - fpnk.r;
1326  f2k.i = fpk.i - fpnk.i;
1327 
1328  tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
1329  - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1330  tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
1331  + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);
1332 
1333  dst[k].r = (f1k.r + tw.r) >> 1;
1334  dst[k].i = (f1k.i + tw.i) >> 1;
1335  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1336  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1337  }
1338  }
1339 }
1340 
1341 static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1342  const ne10_fft_cpx_int16_t *src,
1343  ne10_fft_cpx_int16_t *twiddles,
1344  ne10_int32_t ncfft,
1345  ne10_int32_t scaled_flag)
1346 {
1347 
1348  ne10_int32_t k;
1349  ne10_int32_t count = ncfft / 2;
1350  ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
1351  int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1352  int16x8_t q_fnkc_r, q_fnkc_i;
1353  int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1354  int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1355  int16x8_t q_dst2_r, q_dst2_i;
1356  int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1357 
1358 
1359  dst[0].r = src[0].r + src[ncfft].r;
1360  dst[0].i = src[0].r - src[ncfft].r;
1361 
1362  if (scaled_flag)
1363  NE10_F2I16_FIXDIV (dst[0], 2);
1364  if (count >= 8)
1365  {
1366  if (scaled_flag)
1367  {
1368  for (k = 1; k <= count ; k += 8)
1369  {
1370  p_src = (int16_t*) (& (src[k]));
1371  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1372  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1373  p_dst = (int16_t*) (& (dst[k]));
1374  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1375 
1376  q2_fk = vld2q_s16 (p_src);
1377  q2_fnkc = vld2q_s16 (p_src2);
1378  q2_tw = vld2q_s16 (p_twiddles);
1379  q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1380  q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1381  q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1382  q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1383  q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1384  q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1385  q_fnkc_i = vnegq_s16 (q_fnkc_i);
1386 
1387  q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
1388  q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
1389  q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
1390  q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
1391 
1392  q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1393  q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1394  q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1395  q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1396  q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1397  q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1398 
1399  q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1400  q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1401  q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1402  q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1403  q_dst2_r = vrev32q_s16 (q_dst2_r);
1404  q_dst2_i = vrev32q_s16 (q_dst2_i);
1405  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1406  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1407  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1408  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1409  vst2q_s16 (p_dst, q2_dst);
1410  vst2q_s16 (p_dst2, q2_dst2);
1411 
1412  }
1413 
1414  }
1415  else
1416  {
1417  for (k = 1; k <= count ; k += 8)
1418  {
1419  p_src = (int16_t*) (& (src[k]));
1420  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1421  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1422  p_dst = (int16_t*) (& (dst[k]));
1423  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1424 
1425  q2_fk = vld2q_s16 (p_src);
1426  q2_fnkc = vld2q_s16 (p_src2);
1427  q2_tw = vld2q_s16 (p_twiddles);
1428  q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1429  q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1430  q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1431  q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1432  q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1433  q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1434  q_fnkc_i = vnegq_s16 (q_fnkc_i);
1435 
1436  q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
1437  q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
1438  q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
1439  q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
1440 
1441  q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1442  q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1443  q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1444  q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1445  q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1446  q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1447 
1448  q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1449  q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1450  q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1451  q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1452  q_dst2_r = vrev32q_s16 (q_dst2_r);
1453  q_dst2_i = vrev32q_s16 (q_dst2_i);
1454  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1455  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1456  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1457  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1458  vst2q_s16 (p_dst, q2_dst);
1459  vst2q_s16 (p_dst2, q2_dst2);
1460 
1461  }
1462  }
1463  }
1464  else
1465  {
1466 
1467  for (k = 1; k <= ncfft / 2; k++)
1468  {
1469  fk = src[k];
1470  fnkc.r = src[ncfft - k].r;
1471  fnkc.i = -src[ncfft - k].i;
1472  if (scaled_flag)
1473  {
1474  NE10_F2I16_FIXDIV (fk, 2);
1475  NE10_F2I16_FIXDIV (fnkc, 2);
1476  }
1477 
1478  fek.r = fk.r + fnkc.r;
1479  fek.i = fk.i + fnkc.i;
1480 
1481  tmp.r = fk.r - fnkc.r;
1482  tmp.i = fk.i - fnkc.i;
1483 
1484  fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
1485  + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1486  fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
1487  - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1488 
1489  dst[k].r = fek.r + fok.r;
1490  dst[k].i = fek.i + fok.i;
1491 
1492  dst[ncfft - k].r = fek.r - fok.r;
1493  dst[ncfft - k].i = fok.i - fek.i;
1494  }
1495  }
1496 }
1497 
1498 /**
1499  * Specific implementation of ne10_fft_c2c_1d_int16 using NEON SIMD capabilities.
1500  */
1501 
1502 void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
1503  ne10_fft_cpx_int16_t *fin,
1504  ne10_fft_cfg_int16_t cfg,
1505  ne10_int32_t inverse_fft,
1506  ne10_int32_t scaled_flag)
1507 {
1508  if (scaled_flag)
1509  {
1510  if (inverse_fft)
1511  {
1512  switch (cfg->nfft)
1513  {
1514  case 1:
1515  fout[0] = fin[0];
1516  break;
1517  case 2:
1518  ne10_fft2_backward_int16_scaled (fout, fin);
1519  break;
1520  case 4:
1521  ne10_fft4_backward_int16_scaled (fout, fin);
1522  break;
1523  case 8:
1524  ne10_fft8_backward_int16_scaled (fout, fin);
1525  break;
1526  default:
1527  ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1528  break;
1529  }
1530  }
1531  else
1532  {
1533  switch (cfg->nfft)
1534  {
1535  case 1:
1536  fout[0] = fin[0];
1537  break;
1538  case 2:
1539  ne10_fft2_forward_int16_scaled (fout, fin);
1540  break;
1541  case 4:
1542  ne10_fft4_forward_int16_scaled (fout, fin);
1543  break;
1544  case 8:
1545  ne10_fft8_forward_int16_scaled (fout, fin);
1546  break;
1547  default:
1548  ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1549  break;
1550  }
1551  }
1552  }
1553  else
1554  {
1555  if (inverse_fft)
1556  {
1557  switch (cfg->nfft)
1558  {
1559  case 1:
1560  fout[0] = fin[0];
1561  break;
1562  case 2:
1563  ne10_fft2_backward_int16_unscaled (fout, fin);
1564  break;
1565  case 4:
1566  ne10_fft4_backward_int16_unscaled (fout, fin);
1567  break;
1568  case 8:
1569  ne10_fft8_backward_int16_unscaled (fout, fin);
1570  break;
1571  default:
1572  ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1573  break;
1574  }
1575  }
1576  else
1577  {
1578  switch (cfg->nfft)
1579  {
1580  case 1:
1581  fout[0] = fin[0];
1582  break;
1583  case 2:
1584  ne10_fft2_forward_int16_unscaled (fout, fin);
1585  break;
1586  case 4:
1587  ne10_fft4_forward_int16_unscaled (fout, fin);
1588  break;
1589  case 8:
1590  ne10_fft8_forward_int16_unscaled (fout, fin);
1591  break;
1592  default:
1593  ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1594  break;
1595  }
1596  }
1597  }
1598 }
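/*
 * A minimal usage sketch for ne10_fft_c2c_1d_int16_neon, assuming the public
 * ne10_fft_alloc_c2c_int16() allocator and the NE10_FREE() helper from the Ne10
 * headers, and an arbitrary example size of 1024:
 *
 *     #include "NE10_dsp.h"
 *
 *     void run_fft_example (ne10_fft_cpx_int16_t *in, ne10_fft_cpx_int16_t *out)
 *     {
 *         ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (1024); // twiddles + scratch buffer
 *         if (cfg == NULL)
 *             return;
 *         // forward transform (inverse_fft = 0), scaled to avoid overflow (scaled_flag = 1)
 *         ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);
 *         NE10_FREE (cfg);
 *     }
 */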
1599 
1600 /**
1601  * Specific implementation of ne10_fft_r2c_1d_int16 using NEON SIMD capabilities.
1602  */
1603 
1604 void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
1605  ne10_int16_t *fin,
1606  ne10_fft_r2c_cfg_int16_t cfg,
1607  ne10_int32_t scaled_flag)
1608 {
1609  ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1610  ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1611  ne10_fft_state_int16_t c2c_state;
1612 
1613  c2c_state.nfft = cfg->ncfft;
1614  c2c_state.factors = cfg->factors;
1615  c2c_state.twiddles = cfg->twiddles;
1616  c2c_state.buffer = tmpbuf2;
1617 
1618  ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
1619  ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1620 }
1621 
1622 /**
1623  * Specific implementation of ne10_fft_c2r_1d_int16 using NEON SIMD capabilities.
1624  */
1625 
1626 void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
1627  ne10_fft_cpx_int16_t *fin,
1628  ne10_fft_r2c_cfg_int16_t cfg,
1629  ne10_int32_t scaled_flag)
1630 {
1631  ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1632  ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1633  ne10_fft_state_int16_t c2c_state;
1634 
1635  c2c_state.nfft = cfg->ncfft;
1636  c2c_state.factors = cfg->factors;
1637  c2c_state.twiddles = cfg->twiddles;
1638  c2c_state.buffer = tmpbuf2;
1639 
1640  ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1641  ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1642 }