Project Ne10
An open, optimized software library for the ARM architecture.
test_suite_math.c
Go to the documentation of this file.
1 /*
2  * Copyright 2012-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : test_suite_math.c
30  */
31 
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <math.h>
35 
36 #include "NE10_math.h"
37 #include "seatest.h"
38 
39 //function table
40 ne10_func_2args_t ftbl_2args[MAX_FUNC_COUNT];
41 ne10_func_3args_t ftbl_3args[MAX_FUNC_COUNT];
42 ne10_func_4args_t ftbl_4args[MAX_FUNC_COUNT];
43 ne10_func_5args_t ftbl_5args[MAX_FUNC_COUNT];
44 ne10_func_3args_cst_t ftbl_3args_cst[MAX_FUNC_COUNT];
45 ne10_func_4args_cst_t ftbl_4args_cst[MAX_FUNC_COUNT];
46 ne10_func_5args_cst_t ftbl_5args_cst[MAX_FUNC_COUNT];
47 
/*
 * Input and output buffers for the smoke/regression runs.
 *
 * Every buffer is tracked by two pointers: "guarded_*" is the pointer that the
 * tests hand to free(), and "the*" is the pointer that is passed to the kernels
 * (presumably the start of the usable region inside the guarded allocation --
 * confirm against the allocation macros).
 *
 * NOTE(review): the condition below tests the *value* of REGRESSION_TEST, not
 * whether it is defined; it is left unchanged so it stays in sync with the
 * identical conditions used inside the test functions.
 */
#if defined (SMOKE_TEST)||(REGRESSION_TEST)
/* sources: accumulator, first operand, second operand, constant operand */
static ne10_float32_t *guarded_acc  = NULL, *theacc  = NULL;
static ne10_float32_t *guarded_src1 = NULL, *thesrc1 = NULL;
static ne10_float32_t *guarded_src2 = NULL, *thesrc2 = NULL;
static ne10_float32_t *guarded_cst  = NULL, *thecst  = NULL;

/* destinations: one per implementation so the results can be compared */
static ne10_float32_t *guarded_dst_c    = NULL, *thedst_c    = NULL;
static ne10_float32_t *guarded_dst_neon = NULL, *thedst_neon = NULL;
#endif
64 
/*
 * Buffers and timing state for the performance runs.  The pointer pairs follow
 * the same "guarded_* is freed / the* is passed to the kernel" convention as
 * the regression buffers.
 */
#ifdef PERFORMANCE_TEST
/* sources: accumulator, first operand, second operand, constant operand */
static ne10_float32_t *perftest_guarded_acc  = NULL, *perftest_theacc  = NULL;
static ne10_float32_t *perftest_guarded_src1 = NULL, *perftest_thesrc1 = NULL;
static ne10_float32_t *perftest_guarded_src2 = NULL, *perftest_thesrc2 = NULL;
static ne10_float32_t *perftest_guarded_cst  = NULL, *perftest_thecst  = NULL;

/* destinations, one per implementation */
static ne10_float32_t *perftest_guarded_dst_c    = NULL, *perftest_thedst_c    = NULL;
static ne10_float32_t *perftest_guarded_dst_neon = NULL, *perftest_thedst_neon = NULL;

/* number of floats requested for each performance buffer */
static ne10_uint32_t perftest_length = 0;

/* measured durations of the C and NEON runs, and the figures derived from them */
static ne10_int64_t time_c = 0;
static ne10_int64_t time_neon = 0;
static ne10_float32_t time_speedup = 0.0f;
static ne10_float32_t time_savings = 0.0f;
#endif
86 
87 void test_abs_case0()
88 {
89 #define MAX_VEC_COMPONENTS 4
90  ne10_int32_t loop;
91  ne10_int32_t func_loop;
92 
93  /* init function table */
94  memset (ftbl_3args, 0, sizeof (ftbl_3args));
95  ftbl_3args[ 0] = (ne10_func_3args_t) ne10_abs_float_c;
96  ftbl_3args[ 1] = (ne10_func_3args_t) ne10_abs_float_neon;
97  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_abs_vec2f_c;
98  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_abs_vec2f_neon;
99  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_abs_vec3f_c;
100  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_abs_vec3f_neon;
101  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_abs_vec4f_c;
102  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_abs_vec4f_neon;
103 
104  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
105 
106 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
107  ne10_int32_t vec_size;
108  ne10_int32_t pos;
109  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
110 
111  /* init src memory */
112  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
113 
114  /* init dst memory */
115  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
116  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
117 
118  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
119  {
120  for (loop = 0; loop < TEST_ITERATION; loop++)
121  {
122  vec_size = func_loop + 1;
123 
124  GUARD_ARRAY (thedst_c, loop * vec_size);
125  GUARD_ARRAY (thedst_neon, loop * vec_size);
126 
127  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
128  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
129 
130  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
131  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
132 
133  for (pos = 0; pos < loop; pos++)
134  {
135 #ifdef DEBUG_TRACE
136  ne10_int32_t i;
137  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
138  for (i = 0; i < vec_size; i++)
139  {
140  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
141  }
142 #endif
143  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
144  }
145  }
146  }
147  free (guarded_src1);
148  free (guarded_dst_c);
149  free (guarded_dst_neon);
150 #endif
151 
152 #ifdef PERFORMANCE_TEST
153  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
154  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
155  /* init src memory */
156  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
157 
158  /* init dst memory */
159  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
160  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
161 
162  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
163  {
164  GET_TIME (time_c,
165  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
166  );
167  GET_TIME (time_neon,
168  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
169  );
170  time_speedup = (ne10_float32_t) time_c / time_neon;
171  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
172  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
173  }
174 
175  free (perftest_guarded_src1);
176  free (perftest_guarded_dst_c);
177  free (perftest_guarded_dst_neon);
178 #endif
179 
180  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
181 #undef MAX_VEC_COMPONENTS
182 }
183 
184 void test_addc_case0()
185 {
186 #define MAX_VEC_COMPONENTS 4
187  ne10_int32_t loop;
188  ne10_int32_t func_loop;
189 
190  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
191 
192  /* init function table */
193  memset (ftbl_4args, 0, sizeof (ftbl_4args));
194  memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
195  ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_addc_float_c;
196  ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_addc_float_neon;
197  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addc_vec2f_c;
198  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addc_vec2f_neon;
199  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addc_vec3f_c;
200  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addc_vec3f_neon;
201  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addc_vec4f_c;
202  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addc_vec4f_neon;
203 
204 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
205  ne10_int32_t vec_size;
206  ne10_int32_t pos;
207  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
208 
209  /* init src memory */
210  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
211  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
212 
213  /* init dst memory */
214  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
215  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
216 
217  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
218  {
219  for (loop = 0; loop < TEST_ITERATION; loop++)
220  {
221  vec_size = func_loop + 1;
222 
223  GUARD_ARRAY (thedst_c, loop * vec_size);
224  GUARD_ARRAY (thedst_neon, loop * vec_size);
225 
226  if (func_loop == 0)
227  {
228  ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
229  ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
230  }
231  else
232  {
233  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
234  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
235  }
236 
237 
238  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
239  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
240 
241  for (pos = 0; pos < loop; pos++)
242  {
243 #ifdef DEBUG_TRACE
244  ne10_int32_t i;
245  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
246  for (i = 0; i < vec_size; i++)
247  {
248  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
249  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
250  }
251 #endif
252  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
253  }
254  }
255  }
256  free (guarded_src1);
257  free (guarded_cst);
258  free (guarded_dst_c);
259  free (guarded_dst_neon);
260 #endif
261 
262 #ifdef PERFORMANCE_TEST
263  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
264  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
265  /* init src memory */
266  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
267  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
268 
269  /* init dst memory */
270  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
271  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
272 
273  for (func_loop = 0; func_loop < 1; func_loop++)
274  {
275  GET_TIME (time_c,
276  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
277  );
278  GET_TIME (time_neon,
279  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
280  );
281  time_speedup = (ne10_float32_t) time_c / time_neon;
282  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
283  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
284  }
285  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
286  {
287  GET_TIME (time_c,
288  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
289  );
290  GET_TIME (time_neon,
291  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
292  );
293  time_speedup = (ne10_float32_t) time_c / time_neon;
294  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
295  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
296  }
297 
298  free (perftest_guarded_src1);
299  free (perftest_guarded_cst);
300  free (perftest_guarded_dst_c);
301  free (perftest_guarded_dst_neon);
302 #endif
303 
304  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
305 #undef MAX_VEC_COMPONENTS
306 }
307 
308 void test_add_case0()
309 {
310 #define MAX_VEC_COMPONENTS 4
311  ne10_int32_t loop;
312  ne10_int32_t func_loop;
313 
314  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
315 
316  /* init function table */
317  memset (ftbl_4args, 0, sizeof (ftbl_4args));
318  ftbl_4args[ 0] = (ne10_func_4args_t) ne10_add_float_c;
319  ftbl_4args[ 1] = (ne10_func_4args_t) ne10_add_float_neon;
320  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_add_vec2f_c;
321  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_add_vec2f_neon;
322  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_add_vec3f_c;
323  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_add_vec3f_neon;
324  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_add_vec4f_c;
325  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_add_vec4f_neon;
326 
327 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
328  ne10_int32_t vec_size;
329  ne10_int32_t pos;
330  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
331 
332  /* init src memory */
333  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
334  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
335 
336  /* init dst memory */
337  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
338  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
339 
340  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
341  {
342  for (loop = 0; loop < TEST_ITERATION; loop++)
343  {
344  vec_size = func_loop + 1;
345 
346  GUARD_ARRAY (thedst_c, loop * vec_size);
347  GUARD_ARRAY (thedst_neon, loop * vec_size);
348 
349  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
350  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
351 
352  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
353  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
354 
355  for (pos = 0; pos < loop; pos++)
356  {
357 #ifdef DEBUG_TRACE
358  ne10_int32_t i;
359  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
360  for (i = 0; i < vec_size; i++)
361  {
362  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
363  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
364  }
365 #endif
366  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
367  }
368  }
369  }
370  free (guarded_src1);
371  free (guarded_src2);
372  free (guarded_dst_c);
373  free (guarded_dst_neon);
374 #endif
375 
376 #ifdef PERFORMANCE_TEST
377  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
378  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
379  /* init src memory */
380  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
381  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
382 
383  /* init dst memory */
384  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
385  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
386 
387  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
388  {
389  GET_TIME (time_c,
390  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
391  );
392  GET_TIME (time_neon,
393  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
394  );
395  time_speedup = (ne10_float32_t) time_c / time_neon;
396  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
397  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
398  }
399 
400  free (perftest_guarded_src1);
401  free (perftest_guarded_src2);
402  free (perftest_guarded_dst_c);
403  free (perftest_guarded_dst_neon);
404 #endif
405 
406  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
407 #undef MAX_VEC_COMPONENTS
408 }
409 
410 void test_cross_case0()
411 {
412 #define MAX_VEC_COMPONENTS 3
413  ne10_int32_t loop;
414  ne10_int32_t func_loop;
415 
416  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
417 
418  /* init function table */
419  memset (ftbl_4args, 0, sizeof (ftbl_4args));
420  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_cross_vec3f_c;
421  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_cross_vec3f_neon;
422 
423 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
424  ne10_int32_t vec_size;
425  ne10_int32_t pos;
426  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
427 
428  /* init src memory */
429  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
430  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
431 
432  /* init dst memory */
433  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
434  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
435 
436  for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
437  {
438  for (loop = 0; loop < TEST_ITERATION; loop++)
439  {
440  vec_size = func_loop + 1;
441 
442  GUARD_ARRAY (thedst_c, loop * vec_size);
443  GUARD_ARRAY (thedst_neon, loop * vec_size);
444 
445  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
446  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
447 
448  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
449  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
450 
451  for (pos = 0; pos < loop; pos++)
452  {
453 #ifdef DEBUG_TRACE
454  ne10_int32_t i;
455  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
456  for (i = 0; i < vec_size; i++)
457  {
458  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
459  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
460  }
461 #endif
462  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
463  }
464  }
465  }
466  free (guarded_src1);
467  free (guarded_src2);
468  free (guarded_dst_c);
469  free (guarded_dst_neon);
470 #endif
471 
472 #ifdef PERFORMANCE_TEST
473  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
474  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
475  /* init src memory */
476  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
477  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
478 
479  /* init dst memory */
480  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
481  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
482 
483  for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
484  {
485  GET_TIME (time_c,
486  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
487  );
488  GET_TIME (time_neon,
489  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
490  );
491  time_speedup = (ne10_float32_t) time_c / time_neon;
492  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
493  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
494  }
495 
496  free (perftest_guarded_src1);
497  free (perftest_guarded_src2);
498  free (perftest_guarded_dst_c);
499  free (perftest_guarded_dst_neon);
500 #endif
501 
502  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
503 #undef MAX_VEC_COMPONENTS
504 }
505 
506 void test_divc_case0()
507 {
508 #define MAX_VEC_COMPONENTS 4
509  ne10_int32_t loop;
510  ne10_int32_t func_loop;
511 
512  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
513 
514  /* init function table */
515  memset (ftbl_4args, 0, sizeof (ftbl_4args));
516  memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
517  ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_divc_float_c;
518  ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_divc_float_neon;
519  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_divc_vec2f_c;
520  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_divc_vec2f_neon;
521  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_divc_vec3f_c;
522  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_divc_vec3f_neon;
523  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_divc_vec4f_c;
524  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_divc_vec4f_neon;
525 
526 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
527  ne10_int32_t vec_size;
528  ne10_int32_t pos;
529  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
530 
531  /* init src memory */
532  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
533  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
534 
535  /* init dst memory */
536  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
537  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
538 
539  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
540  {
541  for (loop = 0; loop < TEST_ITERATION; loop++)
542  {
543  vec_size = func_loop + 1;
544 
545  GUARD_ARRAY (thedst_c, loop * vec_size);
546  GUARD_ARRAY (thedst_neon, loop * vec_size);
547 
548  if (func_loop == 0)
549  {
550  ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
551  ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
552  }
553  else
554  {
555  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
556  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
557  }
558 
559  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
560  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
561 
562  for (pos = 0; pos < loop; pos++)
563  {
564 #ifdef DEBUG_TRACE
565  ne10_int32_t i;
566  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
567  for (i = 0; i < vec_size; i++)
568  {
569  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
570  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
571  }
572 #endif
573  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
574  }
575  }
576  }
577  free (guarded_src1);
578  free (guarded_cst);
579  free (guarded_dst_c);
580  free (guarded_dst_neon);
581 #endif
582 
583 #ifdef PERFORMANCE_TEST
584  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
585  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
586  /* init src memory */
587  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
588  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
589 
590  /* init dst memory */
591  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
592  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
593 
594  for (func_loop = 0; func_loop < 1; func_loop++)
595  {
596  GET_TIME (time_c,
597  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
598  );
599  GET_TIME (time_neon,
600  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
601  );
602  time_speedup = (ne10_float32_t) time_c / time_neon;
603  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
604  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
605  }
606  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
607  {
608  GET_TIME (time_c,
609  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
610  );
611  GET_TIME (time_neon,
612  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
613  );
614  time_speedup = (ne10_float32_t) time_c / time_neon;
615  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
616  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
617  }
618 
619  free (perftest_guarded_src1);
620  free (perftest_guarded_cst);
621  free (perftest_guarded_dst_c);
622  free (perftest_guarded_dst_neon);
623 #endif
624 
625  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
626 #undef MAX_VEC_COMPONENTS
627 }
628 
629 void test_div_case0()
630 {
631 #define MAX_VEC_COMPONENTS 4
632  ne10_int32_t loop;
633  ne10_int32_t func_loop;
634 
635  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
636 
637  /* init function table */
638  memset (ftbl_4args, 0, sizeof (ftbl_4args));
639  ftbl_4args[ 0] = (ne10_func_4args_t) ne10_div_float_c;
640  ftbl_4args[ 1] = (ne10_func_4args_t) ne10_div_float_neon;
641  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vdiv_vec2f_c;
642  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vdiv_vec2f_neon;
643  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vdiv_vec3f_c;
644  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vdiv_vec3f_neon;
645  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vdiv_vec4f_c;
646  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vdiv_vec4f_neon;
647 
648 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
649  ne10_int32_t vec_size;
650  ne10_int32_t pos;
651  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
652 
653  /* init src memory */
654  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
655  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
656 
657  /* init dst memory */
658  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
659  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
660 
661  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
662  {
663  for (loop = 0; loop < TEST_ITERATION; loop++)
664  {
665  vec_size = func_loop + 1;
666 
667  GUARD_ARRAY (thedst_c, loop * vec_size);
668  GUARD_ARRAY (thedst_neon, loop * vec_size);
669 
670  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
671  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
672 
673  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
674  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
675 
676  for (pos = 0; pos < loop; pos++)
677  {
678 #ifdef DEBUG_TRACE
679  ne10_int32_t i;
680  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
681  for (i = 0; i < vec_size; i++)
682  {
683  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
684  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
685  }
686 #endif
687  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
688  }
689  }
690  }
691  free (guarded_src1);
692  free (guarded_src2);
693  free (guarded_dst_c);
694  free (guarded_dst_neon);
695 #endif
696 
697 #ifdef PERFORMANCE_TEST
698  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
699  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
700  /* init src memory */
701  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
702  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
703 
704  /* init dst memory */
705  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
706  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
707 
708  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
709  {
710  GET_TIME (time_c,
711  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
712  );
713  GET_TIME (time_neon,
714  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
715  );
716  time_speedup = (ne10_float32_t) time_c / time_neon;
717  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
718  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
719  }
720 
721  free (perftest_guarded_src1);
722  free (perftest_guarded_src2);
723  free (perftest_guarded_dst_c);
724  free (perftest_guarded_dst_neon);
725 #endif
726 
727  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
728 #undef MAX_VEC_COMPONENTS
729 }
730 
731 void test_dot_case0()
732 {
733 #define MAX_VEC_COMPONENTS 4
734  ne10_int32_t loop;
735  ne10_int32_t func_loop;
736 
737  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
738 
739  /* init function table */
740  memset (ftbl_4args, 0, sizeof (ftbl_4args));
741  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_dot_vec2f_c;
742  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_dot_vec2f_neon;
743  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_dot_vec3f_c;
744  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_dot_vec3f_neon;
745  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_dot_vec4f_c;
746  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_dot_vec4f_neon;
747 
748 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
749  ne10_int32_t pos;
750  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
751 
752  /* init src memory */
753  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
754  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
755 
756  /* init dst memory */
757  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
758  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
759 
760  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
761  {
762  for (loop = 0; loop < TEST_ITERATION; loop++)
763  {
764 #ifdef DEBUG_TRACE
765  ne10_int32_t vec_size = func_loop + 1;
766 #endif
767 
768  GUARD_ARRAY (thedst_c, loop);
769  GUARD_ARRAY (thedst_neon, loop);
770 
771  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
772  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
773 
774  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
775  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
776 
777  for (pos = 0; pos < loop; pos++)
778  {
779 #ifdef DEBUG_TRACE
780  ne10_int32_t i;
781  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
782  for (i = 0; i < vec_size; i++)
783  {
784  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
785  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
786  }
787 #endif
788  assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
789  }
790  }
791  }
792  free (guarded_src1);
793  free (guarded_src2);
794  free (guarded_dst_c);
795  free (guarded_dst_neon);
796 #endif
797 
798 #ifdef PERFORMANCE_TEST
799  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
800  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
801  /* init src memory */
802  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
803  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
804 
805  /* init dst memory */
806  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
807  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
808 
809  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
810  {
811  GET_TIME (time_c,
812  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
813  );
814  GET_TIME (time_neon,
815  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
816  );
817  time_speedup = (ne10_float32_t) time_c / time_neon;
818  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
819  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
820  }
821 
822  free (perftest_guarded_src1);
823  free (perftest_guarded_src2);
824  free (perftest_guarded_dst_c);
825  free (perftest_guarded_dst_neon);
826 #endif
827 
828  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
829 #undef MAX_VEC_COMPONENTS
830 }
831 
832 void test_len_case0()
833 {
834 #define MAX_VEC_COMPONENTS 4
835  ne10_int32_t loop;
836  ne10_int32_t func_loop;
837 
838  /* init function table */
839  memset (ftbl_3args, 0, sizeof (ftbl_3args));
840  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_len_vec2f_c;
841  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_len_vec2f_neon;
842  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_len_vec3f_c;
843  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_len_vec3f_neon;
844  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_len_vec4f_c;
845  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_len_vec4f_neon;
846 
847  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
848 
849 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
850  ne10_int32_t vec_size;
851  ne10_int32_t pos;
852  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
853 
854  /* init src memory */
855  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
856 
857  /* init dst memory */
858  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
859  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
860 
861  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
862  {
863  for (loop = 0; loop < TEST_ITERATION; loop++)
864  {
865  vec_size = func_loop + 1;
866 
867  GUARD_ARRAY (thedst_c, loop);
868  GUARD_ARRAY (thedst_neon, loop);
869 
870  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
871  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
872 
873  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
874  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
875 
876  for (pos = 0; pos < loop; pos++)
877  {
878 #ifdef DEBUG_TRACE
879  ne10_int32_t i;
880  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
881  for (i = 0; i < vec_size; i++)
882  {
883  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
884  }
885 #endif
886  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, 1);
887  }
888  }
889  }
890  free (guarded_src1);
891  free (guarded_dst_c);
892  free (guarded_dst_neon);
893 #endif
894 
895 #ifdef PERFORMANCE_TEST
896  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
897  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
898  /* init src memory */
899  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
900 
901  /* init dst memory */
902  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
903  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
904 
905  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
906  {
907  GET_TIME (time_c,
908  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
909  );
910  GET_TIME (time_neon,
911  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
912  );
913  time_speedup = (ne10_float32_t) time_c / time_neon;
914  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
915  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
916  }
917 
918  free (perftest_guarded_src1);
919  free (perftest_guarded_dst_c);
920  free (perftest_guarded_dst_neon);
921 #endif
922 
923  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
924 #undef MAX_VEC_COMPONENTS
925 }
926 
927 void test_mlac_case0()
928 {
929 #define MAX_VEC_COMPONENTS 4
930  ne10_int32_t loop;
931  ne10_int32_t func_loop;
932 
933  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
934 
935  /* init function table */
936  memset (ftbl_5args, 0, sizeof (ftbl_5args));
937  memset (ftbl_5args_cst, 0, sizeof (ftbl_5args_cst));
938  ftbl_5args_cst[ 0] = (ne10_func_5args_cst_t) ne10_mlac_float_c;
939  ftbl_5args_cst[ 1] = (ne10_func_5args_cst_t) ne10_mlac_float_neon;
940  ftbl_5args[ 2] = (ne10_func_5args_t) ne10_mlac_vec2f_c;
941  ftbl_5args[ 3] = (ne10_func_5args_t) ne10_mlac_vec2f_neon;
942  ftbl_5args[ 4] = (ne10_func_5args_t) ne10_mlac_vec3f_c;
943  ftbl_5args[ 5] = (ne10_func_5args_t) ne10_mlac_vec3f_neon;
944  ftbl_5args[ 6] = (ne10_func_5args_t) ne10_mlac_vec4f_c;
945  ftbl_5args[ 7] = (ne10_func_5args_t) ne10_mlac_vec4f_neon;
946 
947 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
948  ne10_int32_t vec_size;
949  ne10_int32_t pos;
950  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
951 
952  /* init src memory */
953  NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
954  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
955  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
956 
957  /* init dst memory */
958  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
959  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
960 
961  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
962  {
963  for (loop = 0; loop < TEST_ITERATION; loop++)
964  {
965  vec_size = func_loop + 1;
966 
967  GUARD_ARRAY (thedst_c, loop * vec_size);
968  GUARD_ARRAY (thedst_neon, loop * vec_size);
969 
970  if (func_loop == 0)
971  {
972  ftbl_5args_cst[2 * func_loop] (thedst_c, theacc, thesrc1, thecst[0], loop);
973  ftbl_5args_cst[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst[0], loop);
974  }
975  else
976  {
977  ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thecst, loop);
978  ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst, loop);
979  }
980 
981  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
982  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
983 
984  for (pos = 0; pos < loop; pos++)
985  {
986 #ifdef DEBUG_TRACE
987  ne10_int32_t i;
988  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
989  for (i = 0; i < vec_size; i++)
990  {
991  fprintf (stdout, "theacc->%d: %f [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
992  fprintf (stdout, "thesrc->%d: %f [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
993  fprintf (stdout, "thecst->%d: %f [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
994  }
995 #endif
996  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
997  }
998  }
999  }
1000  free (guarded_acc);
1001  free (guarded_src1);
1002  free (guarded_cst);
1003  free (guarded_dst_c);
1004  free (guarded_dst_neon);
1005 #endif
1006 
1007 #ifdef PERFORMANCE_TEST
1008  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1009  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1010  /* init src memory */
1011  NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1012  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1013  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1014 
1015  /* init dst memory */
1016  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1017  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1018 
1019  for (func_loop = 0; func_loop < 1; func_loop++)
1020  {
1021  GET_TIME (time_c,
1022  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1023  );
1024  GET_TIME (time_neon,
1025  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1026  );
1027  time_speedup = (ne10_float32_t) time_c / time_neon;
1028  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1029  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1030  }
1031  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1032  {
1033  GET_TIME (time_c,
1034  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1035  );
1036  GET_TIME (time_neon,
1037  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1038  );
1039  time_speedup = (ne10_float32_t) time_c / time_neon;
1040  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1041  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1042  }
1043 
1044  free (perftest_guarded_acc);
1045  free (perftest_guarded_src1);
1046  free (perftest_guarded_cst);
1047  free (perftest_guarded_dst_c);
1048  free (perftest_guarded_dst_neon);
1049 #endif
1050 
1051  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1052 #undef MAX_VEC_COMPONENTS
1053 }
1054 
1055 void test_mla_case0()
1056 {
1057 #define MAX_VEC_COMPONENTS 4
1058  ne10_int32_t loop;
1059  ne10_int32_t func_loop;
1060 
1061  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1062 
1063  /* init function table */
1064  memset (ftbl_5args, 0, sizeof (ftbl_5args));
1065  ftbl_5args[ 0] = (ne10_func_5args_t) ne10_mla_float_c;
1066  ftbl_5args[ 1] = (ne10_func_5args_t) ne10_mla_float_neon;
1067  ftbl_5args[ 2] = (ne10_func_5args_t) ne10_vmla_vec2f_c;
1068  ftbl_5args[ 3] = (ne10_func_5args_t) ne10_vmla_vec2f_neon;
1069  ftbl_5args[ 4] = (ne10_func_5args_t) ne10_vmla_vec3f_c;
1070  ftbl_5args[ 5] = (ne10_func_5args_t) ne10_vmla_vec3f_neon;
1071  ftbl_5args[ 6] = (ne10_func_5args_t) ne10_vmla_vec4f_c;
1072  ftbl_5args[ 7] = (ne10_func_5args_t) ne10_vmla_vec4f_neon;
1073 
1074 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1075  ne10_int32_t vec_size;
1076  ne10_int32_t pos;
1077  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1078 
1079  /* init src memory */
1080  NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1081  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1082  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1083 
1084  /* init dst memory */
1085  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1086  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1087 
1088  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1089  {
1090  for (loop = 0; loop < TEST_ITERATION; loop++)
1091  {
1092  vec_size = func_loop + 1;
1093 
1094  GUARD_ARRAY (thedst_c, loop * vec_size);
1095  GUARD_ARRAY (thedst_neon, loop * vec_size);
1096 
1097  ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thesrc2, loop);
1098  ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thesrc2, loop);
1099 
1100  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1101  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1102 
1103  for (pos = 0; pos < loop; pos++)
1104  {
1105 #ifdef DEBUG_TRACE
1106  ne10_int32_t i;
1107  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1108  for (i = 0; i < vec_size; i++)
1109  {
1110  fprintf (stdout, "theacc->%d: %e [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
1111  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1112  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1113  }
1114 #endif
1115  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1116  }
1117  }
1118  }
1119  free (guarded_acc);
1120  free (guarded_src1);
1121  free (guarded_src2);
1122  free (guarded_dst_c);
1123  free (guarded_dst_neon);
1124 #endif
1125 
1126 #ifdef PERFORMANCE_TEST
1127  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1128  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1129  /* init src memory */
1130  NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1131  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1132  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1133 
1134  /* init dst memory */
1135  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1136  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1137 
1138  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1139  {
1140  GET_TIME (time_c,
1141  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1142  );
1143  GET_TIME (time_neon,
1144  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1145  );
1146  time_speedup = (ne10_float32_t) time_c / time_neon;
1147  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1148  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1149  }
1150 
1151  free (perftest_guarded_acc);
1152  free (perftest_guarded_src1);
1153  free (perftest_guarded_src2);
1154  free (perftest_guarded_dst_c);
1155  free (perftest_guarded_dst_neon);
1156 #endif
1157 
1158  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1159 #undef MAX_VEC_COMPONENTS
1160 }
1161 
1162 void test_mulc_case0()
1163 {
1164 #define MAX_VEC_COMPONENTS 4
1165  ne10_int32_t loop;
1166  ne10_int32_t func_loop;
1167 
1168  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1169 
1170  /* init function table */
1171  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1172  memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1173  ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_mulc_float_c;
1174  ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_mulc_float_neon;
1175  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulc_vec2f_c;
1176  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulc_vec2f_neon;
1177  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulc_vec3f_c;
1178  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulc_vec3f_neon;
1179  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulc_vec4f_c;
1180  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulc_vec4f_neon;
1181 
1182 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1183  ne10_int32_t vec_size;
1184  ne10_int32_t pos;
1185  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1186 
1187  /* init src memory */
1188  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1189  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1190 
1191  /* init dst memory */
1192  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1193  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1194 
1195  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1196  {
1197  for (loop = 0; loop < TEST_ITERATION; loop++)
1198  {
1199  vec_size = func_loop + 1;
1200 
1201  GUARD_ARRAY (thedst_c, loop * vec_size);
1202  GUARD_ARRAY (thedst_neon, loop * vec_size);
1203 
1204  if (func_loop == 0)
1205  {
1206  ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1207  ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1208  }
1209  else
1210  {
1211  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1212  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1213  }
1214 
1215  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1216  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1217 
1218  for (pos = 0; pos < loop; pos++)
1219  {
1220 #ifdef DEBUG_TRACE
1221  ne10_int32_t i;
1222  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1223  for (i = 0; i < vec_size; i++)
1224  {
1225  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1226  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1227  }
1228 #endif
1229  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1230  }
1231  }
1232  }
1233  free (guarded_src1);
1234  free (guarded_cst);
1235  free (guarded_dst_c);
1236  free (guarded_dst_neon);
1237 #endif
1238 
1239 #ifdef PERFORMANCE_TEST
1240  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1241  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1242  /* init src memory */
1243  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1244  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1245 
1246  /* init dst memory */
1247  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1248  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1249 
1250  for (func_loop = 0; func_loop < 1; func_loop++)
1251  {
1252  GET_TIME (time_c,
1253  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1254  );
1255  GET_TIME (time_neon,
1256  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1257  );
1258  time_speedup = (ne10_float32_t) time_c / time_neon;
1259  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1260  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1261  }
1262  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1263  {
1264  GET_TIME (time_c,
1265  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1266  );
1267  GET_TIME (time_neon,
1268  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1269  );
1270  time_speedup = (ne10_float32_t) time_c / time_neon;
1271  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1272  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1273  }
1274 
1275  free (perftest_guarded_src1);
1276  free (perftest_guarded_cst);
1277  free (perftest_guarded_dst_c);
1278  free (perftest_guarded_dst_neon);
1279 #endif
1280 
1281  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1282 #undef MAX_VEC_COMPONENTS
1283 }
1284 
1285 void test_mul_case0()
1286 {
1287 #define MAX_VEC_COMPONENTS 4
1288  ne10_int32_t loop;
1289  ne10_int32_t func_loop;
1290 
1291  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1292 
1293  /* init function table */
1294  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1295  ftbl_4args[ 0] = (ne10_func_4args_t) ne10_mul_float_c;
1296  ftbl_4args[ 1] = (ne10_func_4args_t) ne10_mul_float_neon;
1297  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vmul_vec2f_c;
1298  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vmul_vec2f_neon;
1299  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vmul_vec3f_c;
1300  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vmul_vec3f_neon;
1301  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vmul_vec4f_c;
1302  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vmul_vec4f_neon;
1303 
1304 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1305  ne10_int32_t vec_size;
1306  ne10_int32_t pos;
1307  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1308 
1309  /* init src memory */
1310  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1311  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1312 
1313  /* init dst memory */
1314  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1315  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1316 
1317  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1318  {
1319  for (loop = 0; loop < TEST_ITERATION; loop++)
1320  {
1321  vec_size = func_loop + 1;
1322 
1323  GUARD_ARRAY (thedst_c, loop * vec_size);
1324  GUARD_ARRAY (thedst_neon, loop * vec_size);
1325 
1326  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1327  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1328 
1329  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1330  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1331 
1332  for (pos = 0; pos < loop; pos++)
1333  {
1334 #ifdef DEBUG_TRACE
1335  ne10_int32_t i;
1336  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1337  for (i = 0; i < vec_size; i++)
1338  {
1339  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1340  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1341  }
1342 #endif
1343  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1344  }
1345  }
1346  }
1347  free (guarded_src1);
1348  free (guarded_src2);
1349  free (guarded_dst_c);
1350  free (guarded_dst_neon);
1351 #endif
1352 
1353 #ifdef PERFORMANCE_TEST
1354  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1355  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1356  /* init src memory */
1357  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1358  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1359 
1360  /* init dst memory */
1361  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1362  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1363 
1364  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1365  {
1366  GET_TIME (time_c,
1367  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
1368  );
1369  GET_TIME (time_neon,
1370  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
1371  );
1372  time_speedup = (ne10_float32_t) time_c / time_neon;
1373  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1374  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1375  }
1376 
1377  free (perftest_guarded_src1);
1378  free (perftest_guarded_src2);
1379  free (perftest_guarded_dst_c);
1380  free (perftest_guarded_dst_neon);
1381 #endif
1382 
1383  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1384 #undef MAX_VEC_COMPONENTS
1385 }
1386 
1387 void test_normalize_case0()
1388 {
1389 #define MAX_VEC_COMPONENTS 4
1390  ne10_int32_t loop;
1391  ne10_int32_t func_loop;
1392 
1393  /* init function table */
1394  memset (ftbl_3args, 0, sizeof (ftbl_3args));
1395  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_normalize_vec2f_c;
1396  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_normalize_vec2f_neon;
1397  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_normalize_vec3f_c;
1398  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_normalize_vec3f_neon;
1399  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_normalize_vec4f_c;
1400  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_normalize_vec4f_neon;
1401 
1402  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1403 
1404 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1405  ne10_int32_t vec_size;
1406  ne10_int32_t pos;
1407  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1408 
1409  /* init src memory */
1410  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1411 
1412  /* init dst memory */
1413  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1414  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1415 
1416  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1417  {
1418  for (loop = 0; loop < TEST_ITERATION; loop++)
1419  {
1420  vec_size = func_loop + 1;
1421 
1422  GUARD_ARRAY (thedst_c, loop * vec_size);
1423  GUARD_ARRAY (thedst_neon, loop * vec_size);
1424 
1425  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
1426  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
1427 
1428  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1429  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1430 
1431  for (pos = 0; pos < loop; pos++)
1432  {
1433 #ifdef DEBUG_TRACE
1434  ne10_int32_t i;
1435  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1436  for (i = 0; i < vec_size; i++)
1437  {
1438  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1439  }
1440 #endif
1441  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
1442  }
1443  }
1444  }
1445  free (guarded_src1);
1446  free (guarded_dst_c);
1447  free (guarded_dst_neon);
1448 #endif
1449 
1450 #ifdef PERFORMANCE_TEST
1451  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1452  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1453  /* init src memory */
1454  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1455 
1456  /* init dst memory */
1457  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1458  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1459 
1460  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1461  {
1462  GET_TIME (time_c,
1463  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
1464  );
1465  GET_TIME (time_neon,
1466  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
1467  );
1468  time_speedup = (ne10_float32_t) time_c / time_neon;
1469  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1470  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1471  }
1472 
1473  free (perftest_guarded_src1);
1474  free (perftest_guarded_dst_c);
1475  free (perftest_guarded_dst_neon);
1476 #endif
1477 
1478  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1479 #undef MAX_VEC_COMPONENTS
1480 }
1481 
1482 void test_rsbc_case0()
1483 {
1484 #define MAX_VEC_COMPONENTS 4
1485  ne10_int32_t loop;
1486  ne10_int32_t func_loop;
1487 
1488  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1489 
1490  /* init function table */
1491  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1492  memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1493  ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_rsbc_float_c;
1494  ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_rsbc_float_neon;
1495  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_rsbc_vec2f_c;
1496  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_rsbc_vec2f_neon;
1497  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_rsbc_vec3f_c;
1498  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_rsbc_vec3f_neon;
1499  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_rsbc_vec4f_c;
1500  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_rsbc_vec4f_neon;
1501 
1502 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1503  ne10_int32_t vec_size;
1504  ne10_int32_t pos;
1505  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1506 
1507  /* init src memory */
1508  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1509  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1510 
1511  /* init dst memory */
1512  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1513  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1514 
1515  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1516  {
1517  for (loop = 0; loop < TEST_ITERATION; loop++)
1518  {
1519  vec_size = func_loop + 1;
1520 
1521  GUARD_ARRAY (thedst_c, loop * vec_size);
1522  GUARD_ARRAY (thedst_neon, loop * vec_size);
1523 
1524  if (func_loop == 0)
1525  {
1526  ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1527  ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1528  }
1529  else
1530  {
1531  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1532  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1533  }
1534 
1535  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1536  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1537 
1538  for (pos = 0; pos < loop; pos++)
1539  {
1540 #ifdef DEBUG_TRACE
1541  ne10_int32_t i;
1542  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1543  for (i = 0; i < vec_size; i++)
1544  {
1545  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1546  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1547  }
1548 #endif
1549  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1550  }
1551  }
1552  }
1553  free (guarded_src1);
1554  free (guarded_cst);
1555  free (guarded_dst_c);
1556  free (guarded_dst_neon);
1557 #endif
1558 
1559 #ifdef PERFORMANCE_TEST
1560  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1561  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1562  /* init src memory */
1563  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1564  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1565 
1566  /* init dst memory */
1567  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1568  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1569 
1570  for (func_loop = 0; func_loop < 1; func_loop++)
1571  {
1572  GET_TIME (time_c,
1573  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1574  );
1575  GET_TIME (time_neon,
1576  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1577  );
1578  time_speedup = (ne10_float32_t) time_c / time_neon;
1579  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1580  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1581  }
1582  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1583  {
1584  GET_TIME (time_c,
1585  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1586  );
1587  GET_TIME (time_neon,
1588  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1589  );
1590  time_speedup = (ne10_float32_t) time_c / time_neon;
1591  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1592  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1593  }
1594 
1595  free (perftest_guarded_src1);
1596  free (perftest_guarded_cst);
1597  free (perftest_guarded_dst_c);
1598  free (perftest_guarded_dst_neon);
1599 #endif
1600 
1601  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1602 #undef MAX_VEC_COMPONENTS
1603 }
1604 
1605 void test_setc_case0()
1606 {
1607 #define MAX_VEC_COMPONENTS 4
1608  ne10_int32_t loop;
1609  ne10_int32_t func_loop;
1610 
1611  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1612 
1613  /* init function table */
1614  memset (ftbl_3args, 0, sizeof (ftbl_3args));
1615  memset (ftbl_3args_cst, 0, sizeof (ftbl_3args_cst));
1616  ftbl_3args_cst[ 0] = (ne10_func_3args_cst_t) ne10_setc_float_c;
1617  ftbl_3args_cst[ 1] = (ne10_func_3args_cst_t) ne10_setc_float_neon;
1618  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_setc_vec2f_c;
1619  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_setc_vec2f_neon;
1620  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_setc_vec3f_c;
1621  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_setc_vec3f_neon;
1622  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_setc_vec4f_c;
1623  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_setc_vec4f_neon;
1624 
1625 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1626  ne10_int32_t vec_size;
1627  ne10_int32_t pos;
1628  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1629 
1630  /* init src memory */
1631  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1632 
1633  /* init dst memory */
1634  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1635  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1636 
1637  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1638  {
1639  for (loop = 0; loop < TEST_ITERATION; loop++)
1640  {
1641  vec_size = func_loop + 1;
1642 
1643  GUARD_ARRAY (thedst_c, loop * vec_size);
1644  GUARD_ARRAY (thedst_neon, loop * vec_size);
1645 
1646  if (func_loop == 0)
1647  {
1648  ftbl_3args_cst[2 * func_loop] (thedst_c, thecst[0], loop);
1649  ftbl_3args_cst[2 * func_loop + 1] (thedst_neon, thecst[0], loop);
1650  }
1651  else
1652  {
1653  ftbl_3args[2 * func_loop] (thedst_c, thecst, loop);
1654  ftbl_3args[2 * func_loop + 1] (thedst_neon, thecst, loop);
1655  }
1656 
1657  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1658  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1659 
1660  for (pos = 0; pos < loop; pos++)
1661  {
1662 #ifdef DEBUG_TRACE
1663  ne10_int32_t i;
1664  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1665  for (i = 0; i < vec_size; i++)
1666  {
1667  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1668  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1669  }
1670 #endif
1671  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1672  }
1673  }
1674  }
1675  free (guarded_cst);
1676  free (guarded_dst_c);
1677  free (guarded_dst_neon);
1678 #endif
1679 
1680 #ifdef PERFORMANCE_TEST
1681  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1682  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1683  /* init src memory */
1684  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1685 
1686  /* init dst memory */
1687  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1688  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1689 
1690  for (func_loop = 0; func_loop < 1; func_loop++)
1691  {
1692  GET_TIME (time_c,
1693  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop] (perftest_thedst_c, perftest_thecst[0], loop);
1694  );
1695  GET_TIME (time_neon,
1696  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst[0], loop);
1697  );
1698  time_speedup = (ne10_float32_t) time_c / time_neon;
1699  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1700  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1701  }
1702  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1703  {
1704  GET_TIME (time_c,
1705  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thecst, loop);
1706  );
1707  GET_TIME (time_neon,
1708  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, loop);
1709  );
1710  time_speedup = (ne10_float32_t) time_c / time_neon;
1711  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1712  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1713  }
1714 
1715  free (perftest_guarded_cst);
1716  free (perftest_guarded_dst_c);
1717  free (perftest_guarded_dst_neon);
1718 #endif
1719 
1720  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1721 #undef MAX_VEC_COMPONENTS
1722 }
1723 
1724 void test_subc_case0()
1725 {
1726 #define MAX_VEC_COMPONENTS 4
1727  ne10_int32_t loop;
1728  ne10_int32_t func_loop;
1729 
1730  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1731 
1732  /* init function table */
1733  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1734  memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1735  ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_subc_float_c;
1736  ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_subc_float_neon;
1737  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_subc_vec2f_c;
1738  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_subc_vec2f_neon;
1739  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_subc_vec3f_c;
1740  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_subc_vec3f_neon;
1741  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_subc_vec4f_c;
1742  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_subc_vec4f_neon;
1743 
1744 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1745  ne10_int32_t vec_size;
1746  ne10_int32_t pos;
1747  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1748 
1749  /* init src memory */
1750  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1751  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1752 
1753  /* init dst memory */
1754  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1755  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1756 
1757  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1758  {
1759  for (loop = 0; loop < TEST_ITERATION; loop++)
1760  {
1761  vec_size = func_loop + 1;
1762 
1763  GUARD_ARRAY (thedst_c, loop * vec_size);
1764  GUARD_ARRAY (thedst_neon, loop * vec_size);
1765 
1766  if (func_loop == 0)
1767  {
1768  ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1769  ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1770  }
1771  else
1772  {
1773  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1774  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1775  }
1776 
1777  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1778  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1779 
1780  for (pos = 0; pos < loop; pos++)
1781  {
1782 #ifdef DEBUG_TRACE
1783  ne10_int32_t i;
1784  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1785  for (i = 0; i < vec_size; i++)
1786  {
1787  fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1788  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1789  }
1790 #endif
1791  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1792  }
1793  }
1794  }
1795  free (guarded_src1);
1796  free (guarded_cst);
1797  free (guarded_dst_c);
1798  free (guarded_dst_neon);
1799 #endif
1800 
1801 #ifdef PERFORMANCE_TEST
1802  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1803  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1804  /* init src memory */
1805  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1806  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1807 
1808  /* init dst memory */
1809  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1810  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1811 
1812  for (func_loop = 0; func_loop < 1; func_loop++)
1813  {
1814  GET_TIME (time_c,
1815  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1816  );
1817  GET_TIME (time_neon,
1818  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1819  );
1820  time_speedup = (ne10_float32_t) time_c / time_neon;
1821  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1822  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1823  }
1824  for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1825  {
1826  GET_TIME (time_c,
1827  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1828  );
1829  GET_TIME (time_neon,
1830  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1831  );
1832  time_speedup = (ne10_float32_t) time_c / time_neon;
1833  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1834  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1835  }
1836 
1837  free (perftest_guarded_src1);
1838  free (perftest_guarded_cst);
1839  free (perftest_guarded_dst_c);
1840  free (perftest_guarded_dst_neon);
1841 #endif
1842 
1843  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1844 #undef MAX_VEC_COMPONENTS
1845 }
1846 
1847 void test_sub_case0()
1848 {
1849 #define MAX_VEC_COMPONENTS 4
1850  ne10_int32_t loop;
1851  ne10_int32_t func_loop;
1852 
1853  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1854 
1855  /* init function table */
1856  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1857  ftbl_4args[ 0] = (ne10_func_4args_t) ne10_sub_float_c;
1858  ftbl_4args[ 1] = (ne10_func_4args_t) ne10_sub_float_neon;
1859  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_sub_vec2f_c;
1860  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_sub_vec2f_neon;
1861  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_sub_vec3f_c;
1862  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_sub_vec3f_neon;
1863  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_sub_vec4f_c;
1864  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_sub_vec4f_neon;
1865 
1866 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1867  ne10_int32_t vec_size;
1868  ne10_int32_t pos;
1869  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1870 
1871  /* init src memory */
1872  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1873  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1874 
1875  /* init dst memory */
1876  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1877  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1878 
1879  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1880  {
1881  for (loop = 0; loop < TEST_ITERATION; loop++)
1882  {
1883  vec_size = func_loop + 1;
1884 
1885  GUARD_ARRAY (thedst_c, loop * vec_size);
1886  GUARD_ARRAY (thedst_neon, loop * vec_size);
1887 
1888  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1889  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1890 
1891  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1892  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1893 
1894  for (pos = 0; pos < loop; pos++)
1895  {
1896 #ifdef DEBUG_TRACE
1897  ne10_int32_t i;
1898  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1899  for (i = 0; i < vec_size; i++)
1900  {
1901  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1902  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1903  }
1904 #endif
1905  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1906  }
1907  }
1908  }
1909  free (guarded_src1);
1910  free (guarded_src2);
1911  free (guarded_dst_c);
1912  free (guarded_dst_neon);
1913 #endif
1914 
1915 #ifdef PERFORMANCE_TEST
1916  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
1917  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1918  /* init src memory */
1919  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1920  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1921 
1922  /* init dst memory */
1923  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1924  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1925 
1926  for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1927  {
1928  GET_TIME (time_c,
1929  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
1930  );
1931  GET_TIME (time_neon,
1932  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
1933  );
1934  time_speedup = (ne10_float32_t) time_c / time_neon;
1935  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1936  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1937  }
1938 
1939  free (perftest_guarded_src1);
1940  free (perftest_guarded_src2);
1941  free (perftest_guarded_dst_c);
1942  free (perftest_guarded_dst_neon);
1943 #endif
1944 
1945  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1946 #undef MAX_VEC_COMPONENTS
1947 }
1948 
1949 void test_addmat_case0()
1950 {
1951 #define MAX_VEC_COMPONENTS 4
1952  ne10_int32_t loop;
1953  ne10_int32_t func_loop;
1954 
1955  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1956 
1957  /* init function table */
1958  memset (ftbl_4args, 0, sizeof (ftbl_4args));
1959  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addmat_2x2f_c;
1960  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addmat_2x2f_neon;
1961  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addmat_3x3f_c;
1962  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addmat_3x3f_neon;
1963  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addmat_4x4f_c;
1964  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addmat_4x4f_neon;
1965 
1966 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1967  ne10_int32_t vec_size;
1968  ne10_int32_t pos;
1969  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
1970 
1971  /* init src memory */
1972  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1973  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1974 
1975  /* init dst memory */
1976  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1977  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1978 
1979  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1980  {
1981  for (loop = 0; loop < TEST_ITERATION; loop++)
1982  {
1983  vec_size = (func_loop + 1) * (func_loop + 1);
1984 
1985  GUARD_ARRAY (thedst_c, loop * vec_size);
1986  GUARD_ARRAY (thedst_neon, loop * vec_size);
1987 
1988  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1989  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1990 
1991  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1992  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1993 
1994  for (pos = 0; pos < loop; pos++)
1995  {
1996 #ifdef DEBUG_TRACE
1997  ne10_int32_t i;
1998  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1999  for (i = 0; i < vec_size; i++)
2000  {
2001  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2002  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2003  }
2004 #endif
2005  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2006  }
2007  }
2008  }
2009  free (guarded_src1);
2010  free (guarded_src2);
2011  free (guarded_dst_c);
2012  free (guarded_dst_neon);
2013 #endif
2014 
2015 #ifdef PERFORMANCE_TEST
2016  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2017  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2018  /* init src memory */
2019  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2020  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2021 
2022  /* init dst memory */
2023  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2024  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2025 
2026  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2027  {
2028  GET_TIME (time_c,
2029  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2030  );
2031  GET_TIME (time_neon,
2032  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2033  );
2034  time_speedup = (ne10_float32_t) time_c / time_neon;
2035  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2036  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2037  }
2038 
2039  free (perftest_guarded_src1);
2040  free (perftest_guarded_src2);
2041  free (perftest_guarded_dst_c);
2042  free (perftest_guarded_dst_neon);
2043 #endif
2044 
2045  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2046 #undef MAX_VEC_COMPONENTS
2047 }
2048 
2049 void test_detmat_case0()
2050 {
2051 #define MAX_VEC_COMPONENTS 4
2052  ne10_int32_t loop;
2053  ne10_int32_t func_loop;
2054 
2055  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2056 
2057  /* init function table */
2058  memset (ftbl_3args, 0, sizeof (ftbl_3args));
2059  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_detmat_2x2f_c;
2060  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_detmat_2x2f_neon;
2061  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_detmat_3x3f_c;
2062  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_detmat_3x3f_neon;
2063  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_detmat_4x4f_c;
2064  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_detmat_4x4f_neon;
2065 
2066 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2067  ne10_int32_t vec_size;
2068  ne10_int32_t pos;
2069  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2070 
2071  /* init src memory */
2072  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2073 
2074  /* init dst memory */
2075  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2076  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2077 
2078  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2079  {
2080  for (loop = 0; loop < TEST_ITERATION; loop++)
2081  {
2082  vec_size = (func_loop + 1) * (func_loop + 1);
2083 
2084  GUARD_ARRAY (thedst_c, loop);
2085  GUARD_ARRAY (thedst_neon, loop);
2086 
2087  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2088  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2089 
2090  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
2091  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
2092 
2093  for (pos = 0; pos < loop; pos++)
2094  {
2095 #ifdef DEBUG_TRACE
2096  ne10_int32_t i;
2097  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2098  for (i = 0; i < vec_size; i++)
2099  {
2100  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2101  }
2102 #endif
2103  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, 1);
2104  }
2105  }
2106  }
2107  free (guarded_src1);
2108  free (guarded_dst_c);
2109  free (guarded_dst_neon);
2110 #endif
2111 
2112 #ifdef PERFORMANCE_TEST
2113  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2114  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2115  /* init src memory */
2116  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2117 
2118  /* init dst memory */
2119  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2120  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2121 
2122  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2123  {
2124  GET_TIME (time_c,
2125  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2126  );
2127  GET_TIME (time_neon,
2128  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2129  );
2130  time_speedup = (ne10_float32_t) time_c / time_neon;
2131  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2132  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2133  }
2134 
2135  free (perftest_guarded_src1);
2136  free (perftest_guarded_dst_c);
2137  free (perftest_guarded_dst_neon);
2138 #endif
2139 
2140  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2141 #undef MAX_VEC_COMPONENTS
2142 }
2143 
2144 void test_identitymat_case0()
2145 {
2146 #define MAX_VEC_COMPONENTS 4
2147  ne10_int32_t loop;
2148  ne10_int32_t func_loop;
2149 
2150  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2151 
2152  /* init function table */
2153  memset (ftbl_2args, 0, sizeof (ftbl_2args));
2154  ftbl_2args[ 2] = (ne10_func_2args_t) ne10_identitymat_2x2f_c;
2155  ftbl_2args[ 3] = (ne10_func_2args_t) ne10_identitymat_2x2f_neon;
2156  ftbl_2args[ 4] = (ne10_func_2args_t) ne10_identitymat_3x3f_c;
2157  ftbl_2args[ 5] = (ne10_func_2args_t) ne10_identitymat_3x3f_neon;
2158  ftbl_2args[ 6] = (ne10_func_2args_t) ne10_identitymat_4x4f_c;
2159  ftbl_2args[ 7] = (ne10_func_2args_t) ne10_identitymat_4x4f_neon;
2160 
2161 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2162  ne10_int32_t vec_size;
2163  ne10_int32_t pos;
2164  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2165 
2166  /* init dst memory */
2167  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2168  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2169 
2170  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2171  {
2172  for (loop = 0; loop < TEST_ITERATION; loop++)
2173  {
2174  vec_size = (func_loop + 1) * (func_loop + 1);
2175 
2176  GUARD_ARRAY (thedst_c, loop * vec_size);
2177  GUARD_ARRAY (thedst_neon, loop * vec_size);
2178 
2179  ftbl_2args[2 * func_loop] (thedst_c, loop);
2180  ftbl_2args[2 * func_loop + 1] (thedst_neon, loop);
2181 
2182  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2183  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2184 
2185  for (pos = 0; pos < loop; pos++)
2186  {
2187 #ifdef DEBUG_TRACE
2188  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2189 #endif
2190  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2191  }
2192  }
2193  }
2194  free (guarded_dst_c);
2195  free (guarded_dst_neon);
2196 #endif
2197 
2198 #ifdef PERFORMANCE_TEST
2199  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2200  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2201  /* init dst memory */
2202  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2203  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2204 
2205  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2206  {
2207  GET_TIME (time_c,
2208  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop] (perftest_thedst_c, loop);
2209  );
2210  GET_TIME (time_neon,
2211  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop + 1] (perftest_thedst_neon, loop);
2212  );
2213  time_speedup = (ne10_float32_t) time_c / time_neon;
2214  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2215  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2216  }
2217 
2218  free (perftest_guarded_dst_c);
2219  free (perftest_guarded_dst_neon);
2220 #endif
2221 
2222  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2223 #undef MAX_VEC_COMPONENTS
2224 }
2225 
2226 void test_invmat_case0()
2227 {
2228 #define MAX_VEC_COMPONENTS 4
2229  ne10_int32_t loop;
2230  ne10_int32_t func_loop;
2231 
2232  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2233 
2234  /* init function table */
2235  memset (ftbl_3args, 0, sizeof (ftbl_3args));
2236  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_invmat_2x2f_c;
2237  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_invmat_2x2f_neon;
2238  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_invmat_3x3f_c;
2239  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_invmat_3x3f_neon;
2240  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_invmat_4x4f_c;
2241  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_invmat_4x4f_neon;
2242 
2243 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2244  ne10_int32_t vec_size;
2245  ne10_int32_t pos;
2246  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2247 
2248  /* init src memory */
2249  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2250 
2251  /* init dst memory */
2252  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2253  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2254 
2255  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2256  {
2257  for (loop = 0; loop < TEST_ITERATION; loop++)
2258  {
2259  vec_size = (func_loop + 1) * (func_loop + 1);
2260 
2261  GUARD_ARRAY (thedst_c, loop * vec_size);
2262  GUARD_ARRAY (thedst_neon, loop * vec_size);
2263 
2264  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2265  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2266 
2267  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2268  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2269 
2270  for (pos = 0; pos < loop; pos++)
2271  {
2272 #ifdef DEBUG_TRACE
2273  ne10_int32_t i;
2274  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2275  for (i = 0; i < vec_size; i++)
2276  {
2277  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2278  }
2279 #endif
2280  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
2281  }
2282  }
2283  }
2284  free (guarded_src1);
2285  free (guarded_dst_c);
2286  free (guarded_dst_neon);
2287 #endif
2288 
2289 #ifdef PERFORMANCE_TEST
2290  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2291  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2292  /* init src memory */
2293  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2294 
2295  /* init dst memory */
2296  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2297  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2298 
2299  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2300  {
2301  GET_TIME (time_c,
2302  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2303  );
2304  GET_TIME (time_neon,
2305  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2306  );
2307  time_speedup = (ne10_float32_t) time_c / time_neon;
2308  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2309  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2310  }
2311 
2312  free (perftest_guarded_src1);
2313  free (perftest_guarded_dst_c);
2314  free (perftest_guarded_dst_neon);
2315 #endif
2316 
2317  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2318 #undef MAX_VEC_COMPONENTS
2319 }
2320 
2321 void test_mulmat_case0()
2322 {
2323 #define MAX_VEC_COMPONENTS 4
2324  ne10_int32_t loop;
2325  ne10_int32_t func_loop;
2326 
2327  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2328 
2329  /* init function table */
2330  memset (ftbl_4args, 0, sizeof (ftbl_4args));
2331  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulmat_2x2f_c;
2332  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulmat_2x2f_neon;
2333  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulmat_3x3f_c;
2334  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulmat_3x3f_neon;
2335  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulmat_4x4f_c;
2336  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulmat_4x4f_neon;
2337 
2338 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2339  ne10_int32_t vec_size;
2340  ne10_int32_t pos;
2341  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2342 
2343  /* init src memory */
2344  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2345  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2346 
2347  /* init dst memory */
2348  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2349  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2350 
2351  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2352  {
2353  for (loop = 0; loop < TEST_ITERATION; loop++)
2354  {
2355  vec_size = (func_loop + 1) * (func_loop + 1);
2356 
2357  GUARD_ARRAY (thedst_c, loop * vec_size);
2358  GUARD_ARRAY (thedst_neon, loop * vec_size);
2359 
2360  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2361  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2362 
2363  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2364  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2365 
2366  for (pos = 0; pos < loop; pos++)
2367  {
2368 #ifdef DEBUG_TRACE
2369  ne10_int32_t i;
2370  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2371  for (i = 0; i < vec_size; i++)
2372  {
2373  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2374  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2375  }
2376 #endif
2377  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2378  }
2379  }
2380  }
2381  free (guarded_src1);
2382  free (guarded_src2);
2383  free (guarded_dst_c);
2384  free (guarded_dst_neon);
2385 #endif
2386 
2387 #ifdef PERFORMANCE_TEST
2388  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2389  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2390  /* init src memory */
2391  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2392  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2393 
2394  /* init dst memory */
2395  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2396  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2397 
2398  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2399  {
2400  GET_TIME (time_c,
2401  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2402  );
2403  GET_TIME (time_neon,
2404  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2405  );
2406  time_speedup = (ne10_float32_t) time_c / time_neon;
2407  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2408  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2409  }
2410 
2411  free (perftest_guarded_src1);
2412  free (perftest_guarded_src2);
2413  free (perftest_guarded_dst_c);
2414  free (perftest_guarded_dst_neon);
2415 #endif
2416 
2417  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2418 #undef MAX_VEC_COMPONENTS
2419 }
2420 
2421 void test_submat_case0()
2422 {
2423 #define MAX_VEC_COMPONENTS 4
2424  ne10_int32_t loop;
2425  ne10_int32_t func_loop;
2426 
2427  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2428 
2429  /* init function table */
2430  memset (ftbl_4args, 0, sizeof (ftbl_4args));
2431  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_submat_2x2f_c;
2432  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_submat_2x2f_neon;
2433  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_submat_3x3f_c;
2434  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_submat_3x3f_neon;
2435  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_submat_4x4f_c;
2436  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_submat_4x4f_neon;
2437 
2438 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2439  ne10_int32_t vec_size;
2440  ne10_int32_t pos;
2441  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2442 
2443  /* init src memory */
2444  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2445  NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2446 
2447  /* init dst memory */
2448  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2449  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2450 
2451  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2452  {
2453  for (loop = 0; loop < TEST_ITERATION; loop++)
2454  {
2455  vec_size = (func_loop + 1) * (func_loop + 1);
2456 
2457  GUARD_ARRAY (thedst_c, loop * vec_size);
2458  GUARD_ARRAY (thedst_neon, loop * vec_size);
2459 
2460  ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2461  ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2462 
2463  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2464  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2465 
2466  for (pos = 0; pos < loop; pos++)
2467  {
2468 #ifdef DEBUG_TRACE
2469  ne10_int32_t i;
2470  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2471  for (i = 0; i < vec_size; i++)
2472  {
2473  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2474  fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2475  }
2476 #endif
2477  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2478  }
2479  }
2480  }
2481  free (guarded_src1);
2482  free (guarded_src2);
2483  free (guarded_dst_c);
2484  free (guarded_dst_neon);
2485 #endif
2486 
2487 #ifdef PERFORMANCE_TEST
2488  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2489  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2490  /* init src memory */
2491  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2492  NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2493 
2494  /* init dst memory */
2495  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2496  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2497 
2498  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2499  {
2500  GET_TIME (time_c,
2501  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2502  );
2503  GET_TIME (time_neon,
2504  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2505  );
2506  time_speedup = (ne10_float32_t) time_c / time_neon;
2507  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2508  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2509  }
2510 
2511  free (perftest_guarded_src1);
2512  free (perftest_guarded_src2);
2513  free (perftest_guarded_dst_c);
2514  free (perftest_guarded_dst_neon);
2515 #endif
2516 
2517  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2518 #undef MAX_VEC_COMPONENTS
2519 }
2520 
2521 void test_transmat_case0()
2522 {
2523 #define MAX_VEC_COMPONENTS 4
2524  ne10_int32_t loop;
2525  ne10_int32_t func_loop;
2526 
2527  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2528 
2529  /* init function table */
2530  memset (ftbl_3args, 0, sizeof (ftbl_3args));
2531  ftbl_3args[ 2] = (ne10_func_3args_t) ne10_transmat_2x2f_c;
2532  ftbl_3args[ 3] = (ne10_func_3args_t) ne10_transmat_2x2f_neon;
2533  ftbl_3args[ 4] = (ne10_func_3args_t) ne10_transmat_3x3f_c;
2534  ftbl_3args[ 5] = (ne10_func_3args_t) ne10_transmat_3x3f_neon;
2535  ftbl_3args[ 6] = (ne10_func_3args_t) ne10_transmat_4x4f_c;
2536  ftbl_3args[ 7] = (ne10_func_3args_t) ne10_transmat_4x4f_neon;
2537 
2538 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2539  ne10_int32_t vec_size;
2540  ne10_int32_t pos;
2541  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2542 
2543  /* init src memory */
2544  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2545 
2546  /* init dst memory */
2547  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2548  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2549 
2550  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2551  {
2552  for (loop = 0; loop < TEST_ITERATION; loop++)
2553  {
2554  vec_size = (func_loop + 1) * (func_loop + 1);
2555 
2556  GUARD_ARRAY (thedst_c, loop * vec_size);
2557  GUARD_ARRAY (thedst_neon, loop * vec_size);
2558 
2559  ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2560  ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2561 
2562  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2563  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2564 
2565  for (pos = 0; pos < loop; pos++)
2566  {
2567 #ifdef DEBUG_TRACE
2568  ne10_int32_t i;
2569  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2570  for (i = 0; i < vec_size; i++)
2571  {
2572  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2573  }
2574 #endif
2575  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2576  }
2577  }
2578  }
2579  free (guarded_src1);
2580  free (guarded_dst_c);
2581  free (guarded_dst_neon);
2582 #endif
2583 
2584 #ifdef PERFORMANCE_TEST
2585  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2586  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2587  /* init src memory */
2588  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2589 
2590  /* init dst memory */
2591  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2592  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2593 
2594  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2595  {
2596  GET_TIME (time_c,
2597  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2598  );
2599  GET_TIME (time_neon,
2600  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2601  );
2602  time_speedup = (ne10_float32_t) time_c / time_neon;
2603  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2604  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2605  }
2606 
2607  free (perftest_guarded_src1);
2608  free (perftest_guarded_dst_c);
2609  free (perftest_guarded_dst_neon);
2610 #endif
2611 
2612  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2613 #undef MAX_VEC_COMPONENTS
2614 }
2615 
2616 void test_mulcmatvec_case0()
2617 {
2618 #define MAX_VEC_COMPONENTS 4
2619  ne10_int32_t loop;
2620  ne10_int32_t func_loop;
2621 
2622  fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2623 
2624  /* init function table */
2625  memset (ftbl_4args, 0, sizeof (ftbl_4args));
2626  ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_c;
2627  ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_neon;
2628  ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_c;
2629  ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_neon;
2630  ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_c;
2631  ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_neon;
2632 
2633 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2634  ne10_int32_t vec_size;
2635  ne10_int32_t pos;
2636  const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
2637 
2638  /* init src memory */
2639  NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2640  NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
2641 
2642  /* init dst memory */
2643  NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2644  NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2645 
2646  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2647  {
2648  for (loop = 0; loop < TEST_ITERATION; loop++)
2649  {
2650  vec_size = func_loop + 1;
2651 
2652  GUARD_ARRAY (thedst_c, loop * vec_size);
2653  GUARD_ARRAY (thedst_neon, loop * vec_size);
2654 
2655  ftbl_4args[2 * func_loop] (thedst_c, thecst, thesrc1, loop);
2656  ftbl_4args[2 * func_loop + 1] (thedst_neon, thecst, thesrc1, loop);
2657 
2658  assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2659  assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
2660 
2661  for (pos = 0; pos < loop; pos++)
2662  {
2663 #ifdef DEBUG_TRACE
2664  ne10_int32_t i;
2665  fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2666  for (i = 0; i < vec_size * vec_size; i++)
2667  {
2668  fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
2669  }
2670  for (i = 0; i < vec_size; i++)
2671  {
2672  fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2673  }
2674 #endif
2675  assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2676  }
2677  }
2678  }
2679  free (guarded_src1);
2680  free (guarded_cst);
2681  free (guarded_dst_c);
2682  free (guarded_dst_neon);
2683 #endif
2684 
2685 #ifdef PERFORMANCE_TEST
2686  fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");
2687  perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
2688  /* init src memory */
2689  NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2690  NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
2691 
2692  /* init dst memory */
2693  NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2694  NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2695 
2696  for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2697  {
2698  GET_TIME (time_c,
2699  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thecst, perftest_thesrc1, loop);
2700  );
2701  GET_TIME (time_neon,
2702  for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, perftest_thesrc1, loop);
2703  );
2704  time_speedup = (ne10_float32_t) time_c / time_neon;
2705  time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2706  ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2707  }
2708 
2709  free (perftest_guarded_src1);
2710  free (perftest_guarded_cst);
2711  free (perftest_guarded_dst_c);
2712  free (perftest_guarded_dst_neon);
2713 #endif
2714 
2715  fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2716 #undef MAX_VEC_COMPONENTS
2717 }
2718 
/* Test entry point passed to run_test in test_fixture_math; runs test_abs_case0. */
void test_abs (void)
{
    test_abs_case0();
}
2723 
/* Test entry point passed to run_test in test_fixture_math; runs test_addc_case0. */
void test_addc (void)
{
    test_addc_case0();
}
2728 
/* Test entry point passed to run_test in test_fixture_math; runs test_add_case0. */
void test_add (void)
{
    test_add_case0();
}
2733 
/* Test entry point passed to run_test in test_fixture_math; runs test_cross_case0. */
void test_cross (void)
{
    test_cross_case0();
}
2738 
/* Test entry point passed to run_test in test_fixture_math; runs test_divc_case0. */
void test_divc (void)
{
    test_divc_case0();
}
2743 
/* Test entry point passed to run_test in test_fixture_math; runs test_div_case0. */
void test_div (void)
{
    test_div_case0();
}
2748 
/* Test entry point passed to run_test in test_fixture_math; runs test_dot_case0. */
void test_dot (void)
{
    test_dot_case0();
}
2753 
/* Test entry point passed to run_test in test_fixture_math; runs test_len_case0. */
void test_len (void)
{
    test_len_case0();
}
2758 
/* Test entry point passed to run_test in test_fixture_math; runs test_mlac_case0. */
void test_mlac (void)
{
    test_mlac_case0();
}
2763 
/* Test entry point passed to run_test in test_fixture_math; runs test_mla_case0. */
void test_mla (void)
{
    test_mla_case0();
}
2768 
/* Test entry point passed to run_test in test_fixture_math; runs test_mulc_case0. */
void test_mulc (void)
{
    test_mulc_case0();
}
2773 
/* Test entry point passed to run_test in test_fixture_math; runs test_mul_case0. */
void test_mul (void)
{
    test_mul_case0();
}
/* Test entry point passed to run_test in test_fixture_math; runs test_normalize_case0. */
void test_normalize (void)
{
    test_normalize_case0();
}
2782 
/* Test entry point passed to run_test in test_fixture_math; runs test_rsbc_case0. */
void test_rsbc (void)
{
    test_rsbc_case0();
}
2787 
/* Test entry point passed to run_test in test_fixture_math; runs test_setc_case0. */
void test_setc (void)
{
    test_setc_case0();
}
2792 
/* Test entry point passed to run_test in test_fixture_math; runs test_subc_case0. */
void test_subc (void)
{
    test_subc_case0();
}
2797 
/* Test entry point passed to run_test in test_fixture_math; runs test_sub_case0. */
void test_sub (void)
{
    test_sub_case0();
}
2802 
/* Test entry point passed to run_test in test_fixture_math; runs test_addmat_case0. */
void test_addmat (void)
{
    test_addmat_case0();
}
2807 
/* Test entry point passed to run_test in test_fixture_math; runs test_detmat_case0. */
void test_detmat (void)
{
    test_detmat_case0();
}
2812 
/* Test entry point passed to run_test in test_fixture_math; runs test_identitymat_case0. */
void test_identitymat (void)
{
    test_identitymat_case0();
}
2817 
/* Test entry point passed to run_test in test_fixture_math; runs test_invmat_case0. */
void test_invmat (void)
{
    test_invmat_case0();
}
2822 
/* Test entry point passed to run_test in test_fixture_math; runs test_mulmat_case0. */
void test_mulmat (void)
{
    test_mulmat_case0();
}
2827 
/* Test entry point passed to run_test in test_fixture_math; runs test_mulcmatvec_case0. */
void test_mulcmatvec (void)
{
    test_mulcmatvec_case0();
}
2832 
/* Test entry point passed to run_test in test_fixture_math; runs test_submat_case0. */
void test_submat (void)
{
    test_submat_case0();
}
2837 
/* Test entry point passed to run_test in test_fixture_math; runs test_transmat_case0. */
void test_transmat (void)
{
    test_transmat_case0();
}
2842 
2843 static void my_test_setup (void)
2844 {
2845  //printf("------%-30s start\r\n", __FUNCTION__);
2846  ne10_log_buffer_ptr = ne10_log_buffer;
2847 }
2848 
/*
 * Teardown hook installed through fixture_teardown in test_fixture_math.
 * Currently a no-op.
 */
void my_test_teardown (void)
{
}
2853 
2854 void test_fixture_math (void)
2855 {
2856  test_fixture_start(); // starts a fixture
2857 
2858  fixture_setup (my_test_setup);
2859  fixture_teardown (my_test_teardown);
2860 
2861  run_test (test_abs); // run tests
2862  run_test (test_addc);
2863  run_test (test_add);
2864  run_test (test_cross);
2865  run_test (test_divc);
2866  run_test (test_div);
2867  run_test (test_dot);
2868  run_test (test_len);
2869  run_test (test_mlac);
2870  run_test (test_mla);
2871  run_test (test_mulc);
2872  run_test (test_mul);
2873  run_test (test_normalize);
2874  run_test (test_rsbc);
2875  run_test (test_setc);
2876  run_test (test_subc);
2877  run_test (test_sub);
2878  run_test (test_addmat);
2879  run_test (test_detmat);
2880  run_test (test_identitymat);
2881  run_test (test_invmat);
2882  run_test (test_mulmat);
2883  run_test (test_mulcmatvec);
2884  run_test (test_submat);
2885  run_test (test_transmat);
2886 
2887  test_fixture_end(); // ends a fixture
2888 }
ne10_result_t ne10_mlac_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec2f using plain C code.
Definition: NE10_mlac.c:47
ne10_result_t ne10_len_vec3f_c(ne10_float32_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec3f using plain C code.
Definition: NE10_len.c:50
ne10_result_t ne10_addc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec4f_neon")
Specific implementation of ne10_addc_vec4f using NEON intrinsics.
ne10_result_t ne10_addmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_3x3f using NEON intrinsics.
ne10_result_t ne10_subc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_abs_vec4f_neon")
Specific implementation of ne10_abs_vec4f using NEON intrinsics.
ne10_result_t ne10_normalize_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec3f_neon")
Specific implementation of ne10_normalize_vec3f using NEON intrinsics.
ne10_result_t ne10_addc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec3f_neon")
Specific implementation of ne10_addc_vec3f using NEON intrinsics.
ne10_result_t ne10_mulc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec2f using plain C code.
Definition: NE10_mulc.c:47
ne10_result_t ne10_mlac_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec4f using plain C code.
Definition: NE10_mlac.c:70
ne10_result_t ne10_cross_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_cross_vec3f using plain C code.
Definition: NE10_cross.c:37
ne10_result_t ne10_abs_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_abs_vec2f_neon")
Specific implementation of ne10_abs_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_divc_float using plain C code.
Definition: NE10_divc.c:37
ne10_result_t ne10_dot_vec4f_neon(ne10_float32_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec4f_neon")
Specific implementation of ne10_dot_vec4f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec2f using NEON intrinsics.
int32_t ne10_int32_t
Definition: NE10_types.h:76
ne10_result_t ne10_rsbc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec2f using plain C code.
Definition: NE10_rsbc.c:47
ne10_result_t ne10_divc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec4f using NEON intrinsics.
ne10_result_t ne10_sub_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec2f_neon")
Specific implementation of ne10_sub_vec2f using NEON intrinsics.
ne10_result_t ne10_sub_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec3f_neon")
Specific implementation of ne10_sub_vec3f using NEON intrinsics.
ne10_result_t ne10_submat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_4x4f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec3f using plain C code.
Definition: NE10_rsbc.c:58
ne10_result_t ne10_transmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_3x3f using plain C code.
Definition: NE10_transmat.c:58
ne10_result_t ne10_transmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_2x2f using plain C code.
Definition: NE10_transmat.c:45
ne10_result_t ne10_addc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec3f using plain C code.
Definition: NE10_addc.c:58
ne10_result_t ne10_vmla_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec4f using plain C code.
Definition: NE10_mla.c:70
ne10_result_t ne10_dot_vec4f_c(ne10_float32_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec4f using plain C code.
Definition: NE10_dot.c:60
ne10_result_t ne10_cross_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_cross_vec3f_neon")
Specific implementation of ne10_cross_vec3f using NEON intrinsics.
ne10_func_2args_t ftbl_2args[MAX_FUNC_COUNT]
void my_test_setup(void)
ne10_result_t ne10_normalize_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec2f_neon")
Specific implementation of ne10_normalize_vec2f using NEON intrinsics.
ne10_result_t ne10_detmat_3x3f_c(ne10_float32_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_3x3f using plain C code.
Definition: NE10_detmat.c:48
ne10_result_t ne10_addmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_4x4f using NEON intrinsics.
ne10_result_t ne10_subc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec4f using NEON intrinsics.
typedef float ne10_float32_t
Definition: NE10_types.h:80
ne10_result_t ne10_submat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_4x4f using plain C code.
Definition: NE10_submat.c:71
ne10_result_t ne10_invmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_invmat_2x2f_neon")
Specific implementation of ne10_invmat_2x2f using NEON intrinsics.
ne10_result_t ne10_identitymat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_3x3f_neon")
Specific implementation of ne10_identitymat_3x3f using NEON intrinsics.
ne10_result_t ne10_invmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_invmat_4x4f_neon")
Specific implementation of ne10_invmat_4x4f using NEON intrinsics.
ne10_result_t ne10_identitymat_2x2f_c(ne10_mat2x2f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_2x2f using plain C code.
ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_neon(ne10_vec2f_t *dst, const ne10_mat2x2f_t *cst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm2x2f_v2f_neon")
Specific implementation of ne10_mulcmatvec_cm2x2f_v2f using NEON intrinsics.
ne10_result_t ne10_add_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_add_vec2f_neon")
Specific implementation of ne10_add_vec2f using NEON intrinsics.
ne10_result_t ne10_add_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec2f using plain C code.
Definition: NE10_add.c:48
ne10_result_t ne10_add_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec3f using plain C code.
Definition: NE10_add.c:59
ne10_result_t ne10_abs_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec4f using plain C code.
Definition: NE10_abs.c:72
ne10_result_t ne10_identitymat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_4x4f_neon")
Specific implementation of ne10_identitymat_4x4f using NEON intrinsics.
ne10_result_t ne10_add_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec4f using plain C code.
Definition: NE10_add.c:71
ne10_result_t ne10_vdiv_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec2f_neon")
Specific implementation of ne10_vdiv_vec2f using NEON intrinsics.
ne10_result_t ne10_normalize_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec4f_neon")
Specific implementation of ne10_normalize_vec4f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec2f using plain C code.
Definition: NE10_div.c:47
ne10_result_t ne10_addc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec2f_neon")
Specific implementation of ne10_addc_vec2f using NEON intrinsics.
ne10_result_t ne10_addmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_2x2f using plain C code.
Definition: NE10_addmat.c:37
ne10_result_t ne10_detmat_4x4f_neon(ne10_float32_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_detmat_4x4f_neon")
Specific implementation of ne10_detmat_4x4f using NEON intrinsics.
ne10_result_t ne10_mulc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec3f using NEON intrinsics.
ne10_result_t ne10_identitymat_4x4f_c(ne10_mat4x4f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_4x4f using plain C code.
ne10_result_t ne10_mlac_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec3f using NEON intrinsics.
typedef int64_t ne10_int64_t
Definition: NE10_types.h:78
ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_c(ne10_vec4f_t *dst, const ne10_mat4x4f_t *cst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm4x4f_v4f using plain C code.
ne10_result_t ne10_addc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec2f using plain C code.
Definition: NE10_addc.c:47
ne10_result_t ne10_subc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_subc_float using plain C code.
Definition: NE10_subc.c:37
ne10_result_t ne10_sub_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_sub_float_neon")
Specific implementation of ne10_sub_float using NEON intrinsics.
ne10_result_t ne10_mulc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec3f using plain C code.
Definition: NE10_mulc.c:58
ne10_result_t ne10_mulc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_float using NEON intrinsics.
typedef uint32_t ne10_uint32_t
Definition: NE10_types.h:77
ne10_result_t ne10_sub_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec4f using plain C code.
Definition: NE10_sub.c:70
ne10_result_t ne10_mulmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_3x3f_neon")
Specific implementation of ne10_mulmat_3x3f using NEON intrinsics.
ne10_result_t ne10_mulc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_float using plain C code.
Definition: NE10_mulc.c:37
ne10_result_t ne10_len_vec4f_c(ne10_float32_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec4f using plain C code.
Definition: NE10_len.c:62
ne10_result_t ne10_setc_vec4f_neon(ne10_vec4f_t *dst, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec4f using NEON intrinsics.
ne10_result_t ne10_identitymat_3x3f_c(ne10_mat3x3f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_3x3f using plain C code.
ne10_result_t ne10_normalize_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec4f using plain C code.
ne10_result_t ne10_vmla_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec3f using plain C code.
Definition: NE10_mla.c:58
ne10_result_t ne10_normalize_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec3f using plain C code.
ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_neon(ne10_vec3f_t *dst, const ne10_mat3x3f_t *cst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm3x3f_v3f_neon")
Specific implementation of ne10_mulcmatvec_cm3x3f_v3f using NEON intrinsics.
ne10_result_t ne10_setc_vec2f_neon(ne10_vec2f_t *dst, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec2f using NEON intrinsics.
void my_test_teardown(void)
ne10_result_t ne10_submat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_2x2f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec4f_neon")
Specific implementation of ne10_vdiv_vec4f using NEON intrinsics.
ne10_result_t ne10_len_vec2f_c(ne10_float32_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec2f using plain C code.
Definition: NE10_len.c:39
ne10_result_t ne10_divc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_float_c(ne10_float32_t *dst, ne10_float32_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_float using plain C code.
Definition: NE10_abs.c:39
ne10_result_t ne10_rsbc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec3f using NEON intrinsics.
ne10_result_t ne10_addc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count) asm("ne10_addc_float_neon")
Specific implementation of ne10_addc_float using NEON intrinsics.
ne10_result_t ne10_vmla_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec2f using plain C code.
Definition: NE10_mla.c:47
ne10_result_t ne10_vmul_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec3f using plain C code.
Definition: NE10_mul.c:58
ne10_result_t ne10_identitymat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_2x2f_neon")
Specific implementation of ne10_identitymat_2x2f using NEON intrinsics.
ne10_result_t ne10_vmla_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec2f_neon")
Specific implementation of ne10_vmla_vec2f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec3f_neon")
Specific implementation of ne10_vdiv_vec3f using NEON intrinsics.
ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_c(ne10_vec2f_t *dst, const ne10_mat2x2f_t *cst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm2x2f_v2f using plain C code.
ne10_result_t ne10_div_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_div_float using plain C code.
Definition: NE10_div.c:37
ne10_result_t ne10_mul_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_mul_float_neon")
Specific implementation of ne10_mul_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec2f using plain C code.
Definition: NE10_mul.c:47
ne10_result_t ne10_mulmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_2x2f_neon")
Specific implementation of ne10_mulmat_2x2f using NEON intrinsics.
ne10_result_t ne10_detmat_2x2f_c(ne10_float32_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_2x2f using plain C code.
Definition: NE10_detmat.c:38
ne10_result_t ne10_dot_vec3f_c(ne10_float32_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec3f using plain C code.
Definition: NE10_dot.c:48
ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_neon(ne10_vec4f_t *dst, const ne10_mat4x4f_t *cst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm4x4f_v4f_neon")
Specific implementation of ne10_mulcmatvec_cm4x4f_v4f using NEON intrinsics.
ne10_func_5args_cst_t ftbl_5args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_transmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_transmat_3x3f_neon")
Specific implementation of ne10_transmat_3x3f using NEON intrinsics.
ne10_result_t ne10_vmul_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec4f_neon")
Specific implementation of ne10_vmul_vec4f using NEON intrinsics.
ne10_result_t ne10_mlac_float_neon(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_float using NEON intrinsics.
ne10_result_t ne10_detmat_4x4f_c(ne10_float32_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_4x4f using plain C code.
Definition: NE10_detmat.c:58
ne10_result_t ne10_submat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_3x3f using NEON intrinsics.
ne10_result_t ne10_addc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec4f using plain C code.
Definition: NE10_addc.c:70
ne10_result_t ne10_mlac_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec3f using plain C code.
Definition: NE10_mlac.c:58
ne10_func_3args_cst_t ftbl_3args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_addc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_addc_float using plain C code.
Definition: NE10_addc.c:37
ne10_result_t ne10_setc_vec3f_neon(ne10_vec3f_t *dst, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec3f using NEON intrinsics.
ne10_result_t ne10_setc_vec3f_c(ne10_vec3f_t *dst, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec3f using plain C code.
Definition: NE10_setc.c:58
ne10_result_t ne10_abs_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_abs_vec3f_neon")
Specific implementation of ne10_abs_vec3f using NEON intrinsics.
ne10_func_4args_cst_t ftbl_4args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_mulc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec4f using plain C code.
Definition: NE10_mulc.c:70
ne10_result_t ne10_rsbc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_float using NEON intrinsics.
ne10_result_t ne10_len_vec2f_neon(ne10_float32_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_len_vec2f_neon")
Specific implementation of ne10_len_vec2f using NEON intrinsics.
ne10_result_t ne10_abs_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec2f using plain C code.
Definition: NE10_abs.c:49
ne10_result_t ne10_mulmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_2x2f using plain C code.
Definition: NE10_mulmat.c:37
ne10_result_t ne10_setc_vec2f_c(ne10_vec2f_t *dst, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec2f using plain C code.
Definition: NE10_setc.c:47
ne10_result_t ne10_submat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_3x3f using plain C code.
Definition: NE10_submat.c:51
ne10_result_t ne10_abs_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec3f using plain C code.
Definition: NE10_abs.c:60
ne10_result_t ne10_add_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_add_vec3f_neon")
Specific implementation of ne10_add_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_float_neon(ne10_float32_t *dst, ne10_float32_t *src, ne10_uint32_t count) asm("ne10_abs_float_neon")
Specific implementation of ne10_abs_float using NEON intrinsics.
ne10_func_5args_t ftbl_5args[MAX_FUNC_COUNT]
ne10_result_t ne10_divc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_divc_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec3f_neon")
Specific implementation of ne10_vmul_vec3f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec4f using plain C code.
Definition: NE10_rsbc.c:70
ne10_result_t ne10_dot_vec2f_c(ne10_float32_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec2f using plain C code.
Definition: NE10_dot.c:37
ne10_result_t ne10_subc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec2f using plain C code.
Definition: NE10_subc.c:47
ne10_result_t ne10_transmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_4x4f using plain C code.
Definition: NE10_transmat.c:78
ne10_result_t ne10_vdiv_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec3f using plain C code.
Definition: NE10_div.c:58
ne10_result_t ne10_invmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_4x4f using plain C code.
Definition: NE10_invmat.c:127
ne10_result_t ne10_vmla_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec4f_neon")
Specific implementation of ne10_vmla_vec4f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec4f using plain C code.
Definition: NE10_div.c:70
ne10_result_t ne10_vmul_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec2f_neon")
Specific implementation of ne10_vmul_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec3f using plain C code.
Definition: NE10_divc.c:58
ne10_result_t ne10_detmat_3x3f_neon(ne10_float32_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_detmat_3x3f_neon")
Specific implementation of ne10_detmat_3x3f using NEON intrinsics.
ne10_result_t ne10_mulmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_3x3f using plain C code.
Definition: NE10_mulmat.c:69
ne10_func_3args_t ftbl_3args[MAX_FUNC_COUNT]
ne10_result_t ne10_transmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_transmat_2x2f_neon")
Specific implementation of ne10_transmat_2x2f using NEON intrinsics.
ne10_result_t ne10_dot_vec2f_neon(ne10_float32_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec2f_neon")
Specific implementation of ne10_dot_vec2f using NEON intrinsics.
ne10_result_t ne10_vmla_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec3f_neon")
Specific implementation of ne10_vmla_vec3f using NEON intrinsics.
ne10_result_t ne10_sub_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_float using plain C code.
Definition: NE10_sub.c:37
ne10_result_t ne10_mul_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mul_float using plain C code.
Definition: NE10_mul.c:37
ne10_result_t ne10_addmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_3x3f using plain C code.
Definition: NE10_addmat.c:51
ne10_result_t ne10_len_vec4f_neon(ne10_float32_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_len_vec4f_neon")
Specific implementation of ne10_len_vec4f using NEON intrinsics.
ne10_result_t ne10_sub_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec4f_neon")
Specific implementation of ne10_sub_vec4f using NEON intrinsics.
ne10_result_t ne10_dot_vec3f_neon(ne10_float32_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec3f_neon")
Specific implementation of ne10_dot_vec3f using NEON intrinsics.
ne10_result_t ne10_mulmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_4x4f_neon")
Specific implementation of ne10_mulmat_4x4f using NEON intrinsics.
ne10_result_t ne10_rsbc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_float using plain C code.
Definition: NE10_rsbc.c:37
ne10_result_t ne10_setc_vec4f_c(ne10_vec4f_t *dst, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec4f using plain C code.
Definition: NE10_setc.c:70
ne10_result_t ne10_add_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_float using plain C code.
Definition: NE10_add.c:38
ne10_result_t ne10_subc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_subc_float using NEON intrinsics.
ne10_result_t ne10_mlac_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec4f using NEON intrinsics.
ne10_result_t ne10_transmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_transmat_4x4f_neon")
Specific implementation of ne10_transmat_4x4f using NEON intrinsics.
ne10_result_t ne10_add_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_add_float_neon")
Specific implementation of ne10_add_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec4f using plain C code.
Definition: NE10_mul.c:70
ne10_result_t ne10_sub_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec2f using plain C code.
Definition: NE10_sub.c:47
ne10_result_t ne10_len_vec3f_neon(ne10_float32_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_len_vec3f_neon")
Specific implementation of ne10_len_vec3f using NEON intrinsics.
ne10_result_t ne10_invmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_invmat_3x3f_neon")
Specific implementation of ne10_invmat_3x3f using NEON intrinsics.
ne10_result_t ne10_submat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_2x2f using plain C code.
Definition: NE10_submat.c:37
ne10_result_t ne10_mlac_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec2f using NEON intrinsics.
ne10_result_t ne10_detmat_2x2f_neon(ne10_float32_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_detmat_2x2f_neon")
Specific implementation of ne10_detmat_2x2f using NEON intrinsics.
ne10_result_t ne10_div_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_div_float_neon")
Specific implementation of ne10_div_float using NEON intrinsics.
ne10_result_t ne10_add_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_add_vec4f_neon")
Specific implementation of ne10_add_vec4f using NEON intrinsics.
ne10_result_t ne10_invmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_3x3f using plain C code.
Definition: NE10_invmat.c:65
ne10_result_t ne10_setc_float_neon(ne10_float32_t *dst, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_setc_float using NEON intrinsics.
ne10_result_t ne10_addmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_2x2f using NEON intrinsics.
ne10_result_t ne10_normalize_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec2f using plain C code.
ne10_result_t ne10_mulc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec4f using plain C code.
Definition: NE10_divc.c:70
ne10_result_t ne10_subc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec3f using plain C code.
Definition: NE10_subc.c:58
ne10_result_t ne10_rsbc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec4f using NEON intrinsics.
ne10_result_t ne10_addmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_4x4f using plain C code.
Definition: NE10_addmat.c:71
ne10_result_t ne10_divc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec2f using NEON intrinsics.
ne10_result_t ne10_subc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec4f using plain C code.
Definition: NE10_subc.c:70
ne10_result_t ne10_mulc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec4f using NEON intrinsics.
ne10_func_4args_t ftbl_4args[MAX_FUNC_COUNT]
ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_c(ne10_vec3f_t *dst, const ne10_mat3x3f_t *cst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm3x3f_v3f using plain C code.
#define MAX_VEC_COMPONENTS
ne10_result_t ne10_mulmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_4x4f using plain C code.
Definition: NE10_mulmat.c:127
ne10_result_t ne10_subc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec2f using NEON intrinsics.
ne10_result_t ne10_mlac_float_c(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_float using plain C code.
Definition: NE10_mlac.c:37
ne10_result_t ne10_mla_float_c(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mla_float using plain C code.
Definition: NE10_mla.c:37
ne10_result_t ne10_divc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec2f using plain C code.
Definition: NE10_divc.c:47
ne10_result_t ne10_invmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_2x2f using plain C code.
Definition: NE10_invmat.c:42
ne10_result_t ne10_mla_float_neon(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_mla_float_neon")
Specific implementation of ne10_mla_float using NEON intrinsics.
ne10_result_t ne10_sub_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec3f using plain C code.
Definition: NE10_sub.c:58
ne10_result_t ne10_setc_float_c(ne10_float32_t *dst, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_setc_float using plain C code.
Definition: NE10_setc.c:37