Project Ne10
An open, optimized software library for the ARM architecture.
factor.h
Go to the documentation of this file.
1 /*
2  * Copyright 2011-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : common/factor.h
30  */
31 
// Type-building helper (intrinsic typedefs differ slightly between toolchain
// versions): expands to a doubly-braced initializer holding three {x, y} pairs.
#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
    {{ {x1, y1}, {x2, y2}, {x3, y3} }}
37 
// There are several categories of functions that share common code. Different groups of
// functions take a different number of inputs.
//
// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
//
// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
47 
// A few macros to check pointers and their address range to make sure there's
// no unwanted overlap between any two of them.
//
// NE10_CHECKPOINTER_DstSrcCst asserts that `dst` and `src` (taken, together
// with `count`, from the scope where the macro is expanded) are either the
// same pointer or describe ranges that do not overlap. Each range spans
// `count` ELEMENTS of the pointed-to type: the arithmetic is done on the typed
// pointers, not on `void *` (which is a GNU extension and would only advance
// `count` bytes). The expansion is a single braced statement so that it cannot
// capture an `else` that follows it.
#define NE10_CHECKPOINTER_DstSrcCst { \
    if ( (void *)dst < (void *)src ) \
      { assert ( (void *)(dst + count) <= (void *)src ); } \
    else if ( (void *)dst > (void *)src ) \
      { assert ( (void *)(src + count) <= (void *)dst ); } \
  }
55 
// The dst/src check is the same as the dst/src/cst one (only dst and src are
// pointers that get compared there).
#define NE10_CHECKPOINTER_DstSrc    NE10_CHECKPOINTER_DstSrcCst
57 
// Asserts that the three buffers are pairwise either identical or
// non-overlapping. Pairs are checked in the order (arg1,arg2), (arg1,arg3),
// (arg3,arg2). Every range spans `count` elements of the argument's pointed-to
// type; `count` comes from the expansion scope.
//
// - Arguments are parenthesised so pointer expressions such as `p + n` work.
// - Arithmetic is done on the typed pointers instead of `void *` (GNU-only,
//   and it advanced `count` bytes rather than `count` elements).
// - The whole expansion is one braced statement.
// Note: each argument is evaluated several times; pass side-effect-free
// expressions only.
#define NE10_CHECKPOINTER_3POINTER(arg1, arg2, arg3) { \
    if ( (void *)(arg1) < (void *)(arg2) ) \
      { assert ( (void *)((arg1) + count) <= (void *)(arg2) ); } \
    else if ( (void *)(arg1) > (void *)(arg2) ) \
      { assert ( (void *)((arg2) + count) <= (void *)(arg1) ); } \
    if ( (void *)(arg1) < (void *)(arg3) ) \
      { assert ( (void *)((arg1) + count) <= (void *)(arg3) ); } \
    else if ( (void *)(arg1) > (void *)(arg3) ) \
      { assert ( (void *)((arg3) + count) <= (void *)(arg1) ); } \
    if ( (void *)(arg3) < (void *)(arg2) ) \
      { assert ( (void *)((arg3) + count) <= (void *)(arg2) ); } \
    else if ( (void *)(arg3) > (void *)(arg2) ) \
      { assert ( (void *)((arg2) + count) <= (void *)(arg3) ); } \
  }
71 
// Four-buffer variant: runs NE10_CHECKPOINTER_3POINTER on the first three
// arguments, then checks arg4 against each of them, in the order (arg1,arg4),
// (arg2,arg4), (arg4,arg3). Identical pointers are accepted; otherwise the
// lower range, `count` elements of its pointed-to type long, must end at or
// before the higher pointer. `count` comes from the expansion scope.
//
// Arguments are parenthesised, the arithmetic uses the typed pointers rather
// than the GNU-only `void *` arithmetic (which advanced bytes, not elements),
// and the expansion is a single braced statement.
// Note: each argument is evaluated several times.
#define NE10_CHECKPOINTER_4POINTER(arg1, arg2, arg3, arg4) { \
    NE10_CHECKPOINTER_3POINTER(arg1, arg2, arg3) \
    if ( (void *)(arg1) < (void *)(arg4) ) \
      { assert ( (void *)((arg1) + count) <= (void *)(arg4) ); } \
    else if ( (void *)(arg1) > (void *)(arg4) ) \
      { assert ( (void *)((arg4) + count) <= (void *)(arg1) ); } \
    if ( (void *)(arg2) < (void *)(arg4) ) \
      { assert ( (void *)((arg2) + count) <= (void *)(arg4) ); } \
    else if ( (void *)(arg2) > (void *)(arg4) ) \
      { assert ( (void *)((arg4) + count) <= (void *)(arg2) ); } \
    if ( (void *)(arg4) < (void *)(arg3) ) \
      { assert ( (void *)((arg4) + count) <= (void *)(arg3) ); } \
    else if ( (void *)(arg4) > (void *)(arg3) ) \
      { assert ( (void *)((arg3) + count) <= (void *)(arg4) ); } \
  }
86 
87 
88 
// Group 2 check: hands the caller-scope pointers dst, acc and src to the
// three-pointer overlap check.
#define NE10_CHECKPOINTER_DstAccSrcCst \
    { NE10_CHECKPOINTER_3POINTER(dst, acc, src); }
91 
// Group 3 check: expands to an empty block, i.e. nothing is verified.
#define NE10_CHECKPOINTER_DstCst    {}
93 
// Group 4 check: hands the caller-scope pointers dst, src1 and src2 to the
// three-pointer overlap check.
#define NE10_CHECKPOINTER_DstSrc1Src2 \
    { NE10_CHECKPOINTER_3POINTER(dst, src1, src2); }
96 
// Group 5 check: hands the caller-scope pointers dst, acc, src1 and src2 to
// the four-pointer overlap check.
#define NE10_CHECKPOINTER_DstAccSrc1Src2 \
    { NE10_CHECKPOINTER_4POINTER(dst, acc, src1, src2); }
99 
100 
101 // Main Loop = The loop where the number of items to be processed is exactly the
102 // number that we can process in a single iteration.
103 //
104 // Secondary Loop = The loop that follows a Main Loop to fill in the entries that
105 // did not fit into the Main Loop. This is needed when the number of
106 // input items is not a multiple of the number of items that we
107 // process in every iteration of the Main Loop.
108 
109 
110 /****************************************************
111  * *
112  * The "DstSrcCst" group of functions *
113  * *
114  ****************************************************/
115 
117 
// Main-loop body for the scalar float routines: one 128-bit load from src,
// the caller's operation, one 128-bit store of n_dst to dst. Both pointers
// step by 4 elements. n_src and n_dst are not declared here; they come from
// the enclosing code, and loopCode is what has to fill n_dst.
#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src );      /* fetch the next four inputs */ \
    src += 4; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );       /* write four results */ \
    dst += 4; \
  }
126 
// Leftover step for the scalar float routines: processes ONE element through
// lane 0 of a 64-bit vector. loopCode works on n_rest / n_rest_cst; lane 0 of
// n_rest is what gets written to dst afterwards.
#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest     = { 0.0f , 0.0f };   /* scratch vector; lane 0 receives the input */ \
    float32x2_t n_rest_cst = { cst, cst };      /* the constant in both lanes */ \
    n_rest = vld1_lane_f32 ( (float32_t*)src, n_rest, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    src++;                                      /* one element consumed ... */ \
    dst++;                                      /* ... and one produced */ \
  }
137 
// Driver for the scalar float routines. Expands to a complete function body:
// runs loopCode1 once per group of four items, then loopCode2 once for each of
// the count % 4 leftover items, and returns NE10_OK. `count` (from the
// enclosing function) is decremented in place. Only n_src and n_dst are
// declared here; any other vector the loop code uses (e.g. n_cst) has to be
// provided by the enclosing code.
#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    int dif = count % 4;            /* 0, or 1..3 items left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
  }
155 
157 
// Main-loop body for the 2D-vector routines: one 128-bit load (two 2D vectors
// per the original notes), the caller's operation, one 128-bit store of n_dst.
// src and dst each step by 2 elements.
#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src );      /* two vectors in */ \
    src += 2; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );       /* two vectors out */ \
    dst += 2; \
  }
165 
// Leftover step for the 2D-vector routines (used when count is odd): one
// 64-bit load into n_rest, the caller's operation, one 64-bit store of n_rest.
// The pointers are not advanced here.
#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest; \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    n_rest = vld1_f32 ( (float32_t*)src ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_f32 ( (float32_t*)dst, n_rest ); \
  }
173 
// Driver for the 2D-vector routines. Expands to a complete function body:
// loopCode1 runs once per pair of items, loopCode2 runs once if a single item
// is left over, then NE10_OK is returned. n_cst holds cst's (x, y) twice.
// `count` is decremented in place.
#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    int dif = count % 2;            /* 1 when an odd item remains */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        loopCode2; \
    } \
    return res; \
  }
188 
190 
// Main-loop body for the 3D-vector routines. Each pass moves 12 floats through
// three q-registers (n_src1..3 in, n_dst1..3 out); the matching OPERATION
// macro counts that as 4 items. The pointers are advanced in bytes
// (4 * sizeof(ne10_float32_t) per register) regardless of their declared
// element type. The byte stepping goes through `char *` because arithmetic on
// `void *` is a GNU extension; the resulting addresses are the same.
#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src1 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    n_src2 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    n_src3 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    loopCode;                                   /* caller-supplied operation on 12 floats */ \
    vst1q_f32 ( (float32_t*)dst, n_dst1 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst2 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst3 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
  }
206 
// Leftover step for the 3D-vector routines: handles ONE item. Its three floats
// are loaded de-interleaved into lane 0 of the three d-registers of n_rest,
// loopCode runs, and lane 0 of n_rest is stored back interleaved. The driver
// invokes this for the count % 4 items the main loop did not cover.
#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest = FLOAT32_2x3 ( 0.0f, 0.0f,  0.0f, 0.0f,  0.0f, 0.0f ); \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
                                 (const float32x2_t){cst->y, 0}, \
                                 (const float32x2_t){cst->z, 0} }; \
    n_rest = vld3_lane_f32 ( (float32_t*)src, n_rest, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    src++; \
    dst++; \
  }
218 
// Driver for the 3D-vector routines. Expands to a complete function body:
// loopCode1 runs once per group of four items, loopCode2 once per leftover
// item (count % 4), then NE10_OK is returned. n_cst1..3 together hold
// (x, y, z) of cst repeated four times across 12 lanes. `count` is
// decremented in place.
#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    float32x4_t n_src1, n_src2, n_src3; \
    float32x4_t n_dst1, n_dst2, n_dst3; \
    int dif = count % 4;            /* items left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
  }
238 
240 
/* Note that the VEC4* types need no second loop: every item is handled by
   exactly one 128-bit load/store, so nothing is ever left over. */

// Main-loop body for the 4D-vector routines: load one item into n_src, run
// the caller's operation, store n_dst. src and dst step by one element.
#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src ); \
    src++; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst ); \
    dst++; \
  }
251 
// Driver for the 4D-vector routines. Expands to a complete function body:
// loopCode runs once per item until `count` reaches zero, then NE10_OK is
// returned. n_cst holds cst's (x, y, z, w).
#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    for (; 0 != count; count--) { \
        loopCode; \
    } \
    return res; \
  }
262 
263 /****************************************************
264  * *
265  * The "DstAccSrcCst" group of functions *
266  * *
267  ****************************************************/
268 
270 
// Main-loop body for the scalar float accumulate routines: 128-bit loads from
// acc and src, the caller's operation, one 128-bit store of n_dst. All three
// pointers step by 4 elements. n_acc, n_src and n_dst come from the enclosing
// code.
#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc );      /* four accumulator values */ \
    acc += 4; \
    n_src = vld1q_f32 ( (float32_t*)src );      /* four source values */ \
    src += 4; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );       /* write four results */ \
    dst += 4; \
  }
281 
// Leftover step for the scalar float accumulate routines: processes ONE
// element. acc and src are each loaded into lane 0 of a zeroed 64-bit vector;
// after loopCode, lane 0 of n_rest is written to dst.
#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_acc = { 0.0f , 0.0f };   /* lane 0 receives the accumulator value */ \
    float32x2_t n_rest     = { 0.0f , 0.0f };   /* lane 0 receives the source value */ \
    float32x2_t n_rest_cst = { cst, cst };      /* the constant in both lanes */ \
    n_rest_acc = vld1_lane_f32 ( (float32_t*)acc, n_rest_acc, 0 ); \
    n_rest     = vld1_lane_f32 ( (float32_t*)src, n_rest, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    acc++; \
    src++; \
    dst++; \
  }
295 
// The accumulate variant reuses the DstSrcCst float driver unchanged.
// NOTE(review): that driver declares only n_src and n_dst; n_acc, used by the
// DstAccSrcCst loop macros, presumably has to be declared by the caller -
// confirm.
#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON  NE10_DstSrcCst_OPERATION_FLOAT_NEON
297 
299 
// Main-loop body for the 2D-vector accumulate routines: 128-bit loads from acc
// and src (two 2D vectors each, per the original notes), the caller's
// operation, one 128-bit store of n_dst. All pointers step by 2 elements.
#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc );      /* two accumulator vectors */ \
    acc += 2; \
    n_src = vld1q_f32 ( (float32_t*)src );      /* two source vectors */ \
    src += 2; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst ); \
    dst += 2; \
  }
309 
// Leftover step for the 2D-vector accumulate routines (odd count): 64-bit
// loads from acc and src, the caller's operation, one 64-bit store of n_rest.
// The pointers are not advanced here.
#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest_acc; \
    float32x2_t n_rest; \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    n_rest_acc = vld1_f32 ( (float32_t*)acc ); \
    n_rest     = vld1_f32 ( (float32_t*)src ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_f32 ( (float32_t*)dst, n_rest ); \
  }
319 
// The accumulate variant reuses the DstSrcCst 2D-vector driver unchanged.
// NOTE(review): that driver does not declare n_acc - confirm the caller does.
#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON  NE10_DstSrcCst_OPERATION_VEC2F_NEON
321 
323 
// Main-loop body for the 3D-vector accumulate routines. Each pass moves 12
// floats from acc and 12 from src through three q-registers each, runs the
// caller's operation, and stores 12 floats from n_dst1..3; the shared VEC3F
// driver counts that as 4 items. The pointers are advanced in bytes
// (4 * sizeof(ne10_float32_t) per register) regardless of their declared
// element type. The byte stepping goes through `char *` because arithmetic on
// `void *` is a GNU extension; the resulting addresses are the same.
#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_acc1 = vld1q_f32 ( (float32_t*)acc );     /* accumulator values */ \
    acc = (void *)( (char *)acc + 4*sizeof(ne10_float32_t) ); \
    n_acc2 = vld1q_f32 ( (float32_t*)acc ); \
    acc = (void *)( (char *)acc + 4*sizeof(ne10_float32_t) ); \
    n_acc3 = vld1q_f32 ( (float32_t*)acc ); \
    acc = (void *)( (char *)acc + 4*sizeof(ne10_float32_t) ); \
    n_src1 = vld1q_f32 ( (float32_t*)src );     /* source values */ \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    n_src2 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    n_src3 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *)( (char *)src + 4*sizeof(ne10_float32_t) ); \
    loopCode;                                   /* caller-supplied operation on 12 floats */ \
    vst1q_f32 ( (float32_t*)dst, n_dst1 );      /* results back to memory */ \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst2 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst3 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
  }
345 
// Leftover step for the 3D-vector accumulate routines: handles ONE item. The
// three floats at acc and at src are loaded de-interleaved into lane 0 of
// n_rest_acc and n_rest; after loopCode, lane 0 of n_rest is stored to dst.
// The shared VEC3F driver invokes this for the count % 4 remaining items.
#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest_acc = FLOAT32_2x3 ( 0.0f, 0.0f,  0.0f, 0.0f,  0.0f, 0.0f ); \
    float32x2x3_t n_rest     = FLOAT32_2x3 ( 0.0f, 0.0f,  0.0f, 0.0f,  0.0f, 0.0f ); \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
                                 (const float32x2_t){cst->y, 0}, \
                                 (const float32x2_t){cst->z, 0} }; \
    n_rest_acc = vld3_lane_f32 ( (float32_t*)acc, n_rest_acc, 0 ); \
    n_rest     = vld3_lane_f32 ( (float32_t*)src, n_rest, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    acc++; \
    src++; \
    dst++; \
  }
368 
// The accumulate variant reuses the DstSrcCst 3D-vector driver unchanged.
// NOTE(review): that driver does not declare n_acc1..3 - confirm the caller
// does.
#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON  NE10_DstSrcCst_OPERATION_VEC3F_NEON
370 
372 
// Main-loop body for the 4D-vector accumulate routines: one 128-bit load each
// from acc and src, the caller's operation, one 128-bit store of n_dst. Every
// pointer steps by one element.
#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc ); \
    acc++; \
    n_src = vld1q_f32 ( (float32_t*)src ); \
    src++; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst ); \
    dst++; \
  }
382 
// The accumulate variant reuses the DstSrcCst 4D-vector driver unchanged.
// NOTE(review): that driver does not declare n_acc - confirm the caller does.
#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON  NE10_DstSrcCst_OPERATION_VEC4F_NEON
384 
385 /****************************************************
386  * *
387  * The "DstCst" group of functions *
388  * *
389  ****************************************************/
390 
392 
// Main-loop body for the scalar float dst/cst routines: nothing is loaded;
// after the caller's operation, n_cst is stored as four floats and dst steps
// by 4 elements. n_cst is not declared by this macro.
#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_cst );       /* write four values */ \
    dst += 4; \
  }
399 
// Leftover step for the scalar float dst/cst routines: writes ONE element,
// taken from lane 0 of n_rest_cst after the caller's operation has run.
#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_cst = { cst, cst };      /* the constant in both lanes */ \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest_cst, 0 ); \
    dst++; \
  }
407 
// Driver for the scalar float dst/cst routines. Expands to a complete function
// body: loopCode1 runs once per group of four items, loopCode2 once per
// leftover item (count % 4), then NE10_OK is returned. `count` is decremented
// in place. No vector variables are declared here; n_cst, which the main-loop
// macro stores, has to come from the enclosing code.
#define NE10_DstCst_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    int dif = count % 4;            /* 0, or 1..3 items left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
  }
423 
425 
426 
// Main-loop body for the 2D-vector dst/cst routines: after the caller's
// operation, n_cst is stored with one 128-bit write (two 2D vectors per the
// original notes) and dst steps by 2 elements.
#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_cst ); \
    dst += 2; \
  }
432 
// Leftover step for the 2D-vector dst/cst routines (odd count): after the
// caller's operation, n_rest_cst is stored with one 64-bit write. dst is not
// advanced here.
#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_f32 ( (float32_t*)dst, n_rest_cst ); \
  }
438 
// Driver for the 2D-vector dst/cst routines. Expands to a complete function
// body: loopCode1 runs once per pair of items, loopCode2 once if a single item
// is left, then NE10_OK is returned. n_cst holds cst's (x, y) twice. `count`
// is decremented in place.
#define NE10_DstCst_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    int dif = count % 2;            /* 1 when an odd item remains */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        loopCode2; \
    } \
    return res; \
  }
451 
453 
// Main-loop body for the 3D-vector dst/cst routines. After the caller's
// operation, n_cst1..3 (12 floats) are stored back to back; the matching
// OPERATION macro counts that as 4 items. dst is advanced in bytes
// (4 * sizeof(ne10_float32_t) per register) regardless of its declared element
// type. The byte stepping goes through `char *` because arithmetic on `void *`
// is a GNU extension; the resulting addresses are the same.
#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_cst1 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_cst2 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_cst3 ); \
    dst = (void *)( (char *)dst + 4*sizeof(ne10_float32_t) ); \
  }
463 
// Leftover step for the 3D-vector dst/cst routines: writes ONE item (three
// floats) from lane 0 of n_rest_cst after the caller's operation, then steps
// dst by one element. The driver invokes this for the count % 4 remaining
// items.
#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
                                 (const float32x2_t){cst->y, 0}, \
                                 (const float32x2_t){cst->z, 0} }; \
    loopCode;                                   /* caller-supplied operation */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest_cst, 0 ); \
    dst++; \
  }
471 
// Driver for the 3D-vector dst/cst routines. Expands to a complete function
// body: loopCode1 runs once per group of four items, loopCode2 once per
// leftover item (count % 4), then NE10_OK is returned. n_cst1..3 together hold
// (x, y, z) of cst repeated four times across 12 lanes. `count` is decremented
// in place.
#define NE10_DstCst_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    int dif = count % 4;            /* items left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
  }
489 
491 
// Main-loop body for the 4D-vector dst/cst routines: after the caller's
// operation, n_cst is stored with one 128-bit write and dst steps by one
// element.
#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_cst ); \
    dst++; \
  }
497 
// Driver for the 4D-vector dst/cst routines. Expands to a complete function
// body: loopCode runs once per item until `count` reaches zero, then NE10_OK
// is returned. n_cst holds cst's (x, y, z, w).
#define NE10_DstCst_OPERATION_VEC4F_NEON(loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    for (; 0 != count; count--) { \
        loopCode; \
    } \
    return res; \
  }
506 
507 /****************************************************
508  * *
509  * The "DstSrc1Src2" group of functions *
510  * *
511  ****************************************************/
512 
514 
// Main-loop body for the two-source scalar float routines: 128-bit loads from
// src1 (into n_src) and src2 (into n_src2), the caller's operation, one
// 128-bit store of n_dst. All three pointers step by 4 elements.
#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src  = vld1q_f32 ( (float32_t*)src1 );    /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ( (float32_t*)src2 );    /* four values of the second input */ \
    src2 += 4; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );       /* write four results */ \
    dst += 4; \
  }
525 
// Leftover step for the two-source scalar float routines: processes ONE
// element. src1 is loaded into lane 0 of n_rest and src2 into lane 0 of
// n_rest2 (each on top of its own zeroed vector); after loopCode, lane 0 of
// n_rest is written to dst.
//
// The src2 load previously used n_rest as its base vector instead of n_rest2,
// a copy/paste slip that tied the second load to the first one and disagreed
// with the DstAccSrc1Src2 variant. Lane 1 was zero either way, so the values
// seen by loopCode are unchanged.
#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest  = { 0.0f , 0.0f };      /* lane 0 receives the src1 value */ \
    float32x2_t n_rest2 = { 0.0f , 0.0f };      /* lane 0 receives the src2 value */ \
    n_rest  = vld1_lane_f32 ( (float32_t*)src1, n_rest, 0 ); \
    n_rest2 = vld1_lane_f32 ( (float32_t*)src2, n_rest2, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    src1++; \
    src2++; \
    dst++; \
  }
538 
// The two-source variant reuses the DstSrcCst float driver unchanged.
// NOTE(review): that driver declares only n_src and n_dst; n_src2, used by
// the DstSrc1Src2 main loop, presumably has to be declared by the caller -
// confirm.
#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON   NE10_DstSrcCst_OPERATION_FLOAT_NEON
540 
541 /****************************************************
542  * *
543  * The "DstAccSrc1Src2" group of functions *
544  * *
545  ****************************************************/
546 
548 
// Main-loop body for the two-source accumulate float routines: 128-bit loads
// from acc, src1 and src2, the caller's operation, one 128-bit store of n_dst.
// All four pointers step by 4 elements.
#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc  = vld1q_f32 ( (float32_t*)acc );     /* four accumulator values */ \
    acc += 4; \
    n_src  = vld1q_f32 ( (float32_t*)src1 );    /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ( (float32_t*)src2 );    /* four values of the second input */ \
    src2 += 4; \
    loopCode;                                   /* caller-supplied operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );       /* write four results */ \
    dst += 4; \
  }
561 
// Leftover step for the two-source accumulate float routines: processes ONE
// element. acc, src1 and src2 are each loaded into lane 0 of their own zeroed
// 64-bit vector; after loopCode, lane 0 of n_rest is written to dst.
#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_acc = { 0.0f , 0.0f };   /* lane 0 receives the accumulator value */ \
    float32x2_t n_rest     = { 0.0f , 0.0f };   /* lane 0 receives the src1 value */ \
    float32x2_t n_rest2    = { 0.0f, 0.0f };    /* lane 0 receives the src2 value */ \
    n_rest_acc = vld1_lane_f32 ( (float32_t*)acc,  n_rest_acc, 0 ); \
    n_rest     = vld1_lane_f32 ( (float32_t*)src1, n_rest, 0 ); \
    n_rest2    = vld1_lane_f32 ( (float32_t*)src2, n_rest2, 0 ); \
    loopCode;                                   /* caller-supplied operation */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0 ); \
    acc++; \
    src1++; \
    src2++; \
    dst++; \
  }
577 
// The two-source accumulate variant reuses the DstAccSrcCst float driver
// (itself an alias of the DstSrcCst one).
#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON    NE10_DstAccSrcCst_OPERATION_FLOAT_NEON