49 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 65 #ifdef PERFORMANCE_TEST 89 #define MAX_VEC_COMPONENTS 4 104 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
106 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 112 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
115 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
116 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
120 for (loop = 0; loop < TEST_ITERATION; loop++)
122 vec_size = func_loop + 1;
124 GUARD_ARRAY (thedst_c, loop * vec_size);
125 GUARD_ARRAY (thedst_neon, loop * vec_size);
127 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
128 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
130 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
131 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
133 for (pos = 0; pos < loop; pos++)
137 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
138 for (i = 0; i < vec_size; i++)
140 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
143 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
148 free (guarded_dst_c);
149 free (guarded_dst_neon);
152 #ifdef PERFORMANCE_TEST 153 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
156 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
159 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
160 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
165 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
168 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
171 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
172 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
175 free (perftest_guarded_src1);
176 free (perftest_guarded_dst_c);
177 free (perftest_guarded_dst_neon);
180 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_addc_case0 -- drives the function pairs stored in ftbl_4args_cst
 * (constant passed by value as thecst[0]) and ftbl_4args (constant passed as
 * the thecst pointer). Even table slots write into the *_c destination and odd
 * slots into the *_neon destination; presumably these are the C reference and
 * the NEON implementation -- confirm against the table setup, which is not
 * part of this listing.
 * NOTE(review): the numbers at the start of lines look like original line
 * numbers from a partial extract; declarations, braces and the code that
 * selects between the two tables are not shown here.
 * NOTE(review): `(REGRESSION_TEST)` below is evaluated as a value, not via
 * `defined` -- confirm this is intended.
 */
181 #undef MAX_VEC_COMPONENTS 184 void test_addc_case0()
186 #define MAX_VEC_COMPONENTS 4 190 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: allocate the source, the constant and both destination buffers. */
204 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 210 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
211 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
214 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
215 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* Each iteration passes `loop` as the count, with vec_size = func_loop + 1 components per vector. */
219 for (loop = 0; loop < TEST_ITERATION; loop++)
221 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements; they are checked after the calls. */
223 GUARD_ARRAY (thedst_c, loop * vec_size);
224 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Constant passed by value: only thecst[0] is used. */
228 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
229 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Constant passed by pointer. */
233 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
234 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
/* Both guard checks must report true after the calls above. */
238 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
239 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Per vector: print the indices and every input value together with its raw
 * 32-bit pattern (read through a ne10_uint32_t pointer cast), then require
 * both destinations to agree within ERROR_MARGIN_SMALL. Any conditional
 * around the prints is not visible in this listing.
 */
241 for (pos = 0; pos < loop; pos++)
245 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
246 for (i = 0; i < vec_size; i++)
248 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
249 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
252 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
258 free (guarded_dst_c);
259 free (guarded_dst_neon);
/* Performance section: print the table header and allocate the perftest buffers. */
262 #ifdef PERFORMANCE_TEST 263 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
266 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
267 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
270 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
271 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/*
 * By-value constant pair, func_loop limited to 0. The code that sets time_c,
 * time_neon and time_speedup is not visible in this listing.
 */
273 for (func_loop = 0; func_loop < 1; func_loop++)
276 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
279 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
282 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
283 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* By-pointer constant pair; the enclosing func_loop loop is not visible here. */
288 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
291 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
294 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
295 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
298 free (perftest_guarded_src1);
299 free (perftest_guarded_cst);
300 free (perftest_guarded_dst_c);
301 free (perftest_guarded_dst_neon);
304 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_add_case0 -- runs the two-source function pairs stored in ftbl_4args on
 * thesrc1 and thesrc2. The even slot writes thedst_c and the odd slot writes
 * thedst_neon (presumably C reference vs. NEON -- confirm against the table
 * setup, which is not part of this listing); the two outputs are then compared.
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; declarations, braces and the func_loop loop are not shown.
 */
305 #undef MAX_VEC_COMPONENTS 308 void test_add_case0()
310 #define MAX_VEC_COMPONENTS 4 314 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: two source buffers and two destination buffers. */
327 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 333 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
334 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
337 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
338 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count handed to the functions; vec_size = func_loop + 1. */
342 for (loop = 0; loop < TEST_ITERATION; loop++)
344 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
346 GUARD_ARRAY (thedst_c, loop * vec_size);
347 GUARD_ARRAY (thedst_neon, loop * vec_size);
349 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
350 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* The guard checks must still report true after both calls. */
352 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
353 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Per vector: print both inputs with their raw 32-bit patterns, then require
 * the two destinations to match within ERROR_MARGIN_SMALL over vec_size
 * components.
 */
355 for (pos = 0; pos < loop; pos++)
359 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
360 for (i = 0; i < vec_size; i++)
362 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
363 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
366 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
372 free (guarded_dst_c);
373 free (guarded_dst_neon);
/* Performance section: table header, perftest buffers, then one run of each slot per iteration. */
376 #ifdef PERFORMANCE_TEST 377 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
380 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
381 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
384 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
385 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The code that sets time_c, time_neon and time_speedup is not visible in this listing. */
390 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
393 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
396 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
397 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
400 free (perftest_guarded_src1);
401 free (perftest_guarded_src2);
402 free (perftest_guarded_dst_c);
403 free (perftest_guarded_dst_neon);
406 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_cross_case0 -- same structure as the other two-source tests, but with
 * MAX_VEC_COMPONENTS set to 3. Calls the ftbl_4args pair selected by func_loop
 * on thesrc1/thesrc2 and compares thedst_c with thedst_neon (presumably C
 * reference vs. NEON -- confirm against the table setup, not shown here).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; the lines that set func_loop and the braces are not shown.
 */
407 #undef MAX_VEC_COMPONENTS 410 void test_cross_case0()
412 #define MAX_VEC_COMPONENTS 3 416 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: allocate both sources and both destinations. */
423 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 429 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
430 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
433 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
434 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* vec_size is derived from func_loop; `loop` is the count passed to the functions. */
438 for (loop = 0; loop < TEST_ITERATION; loop++)
440 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
442 GUARD_ARRAY (thedst_c, loop * vec_size);
443 GUARD_ARRAY (thedst_neon, loop * vec_size);
445 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
446 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* The guard checks must still report true after both calls. */
448 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
449 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Per vector: print the inputs with raw bit patterns, then compare outputs within ERROR_MARGIN_SMALL. */
451 for (pos = 0; pos < loop; pos++)
455 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
456 for (i = 0; i < vec_size; i++)
458 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
459 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
462 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
468 free (guarded_dst_c);
469 free (guarded_dst_neon);
/* Performance section: table header, perftest buffers, repeated calls, then the log line. */
472 #ifdef PERFORMANCE_TEST 473 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
476 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
477 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
480 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
481 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The code that sets time_c, time_neon and time_speedup is not visible in this listing. */
486 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
489 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
492 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
493 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
496 free (perftest_guarded_src1);
497 free (perftest_guarded_src2);
498 free (perftest_guarded_dst_c);
499 free (perftest_guarded_dst_neon);
502 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_divc_case0 -- source-plus-constant test. Uses ftbl_4args_cst with the
 * constant passed by value (thecst[0]) and ftbl_4args with the constant passed
 * as the thecst pointer. Even slots fill thedst_c, odd slots fill thedst_neon
 * (presumably C reference vs. NEON -- confirm against the table setup, which
 * is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; the control flow choosing between the two tables, the
 * declarations and the braces are not shown.
 */
503 #undef MAX_VEC_COMPONENTS 506 void test_divc_case0()
508 #define MAX_VEC_COMPONENTS 4 512 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: the constant buffer holds MAX_VEC_COMPONENTS elements. */
526 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 532 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
533 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
536 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
537 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1. */
541 for (loop = 0; loop < TEST_ITERATION; loop++)
543 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
545 GUARD_ARRAY (thedst_c, loop * vec_size);
546 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Constant passed by value: only thecst[0] is used. */
550 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
551 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Constant passed by pointer. */
555 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
556 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
/* The guard checks must still report true after the calls. */
559 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
560 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Per vector: print source and constant values with raw bit patterns, then compare within ERROR_MARGIN_SMALL. */
562 for (pos = 0; pos < loop; pos++)
566 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
567 for (i = 0; i < vec_size; i++)
569 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
570 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
573 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
579 free (guarded_dst_c);
580 free (guarded_dst_neon);
/* Performance section: print the table header and allocate the perftest buffers. */
583 #ifdef PERFORMANCE_TEST 584 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
587 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
588 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/*
 * By-value constant pair, func_loop limited to 0. The code that sets time_c,
 * time_neon and time_speedup is not visible in this listing.
 */
594 for (func_loop = 0; func_loop < 1; func_loop++)
597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
603 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
604 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* By-pointer constant pair; the enclosing func_loop loop is not visible here. */
609 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
612 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
615 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
616 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
619 free (perftest_guarded_src1);
620 free (perftest_guarded_cst);
621 free (perftest_guarded_dst_c);
622 free (perftest_guarded_dst_neon);
625 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_div_case0 -- two-source test through ftbl_4args. Unlike the sibling
 * tests that use ERROR_MARGIN_SMALL, the output comparison here uses
 * ERROR_MARGIN_LARGE. Even slots fill thedst_c, odd slots fill thedst_neon
 * (presumably C reference vs. NEON -- confirm against the table setup, which
 * is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; declarations, braces and the func_loop loop are not shown.
 */
626 #undef MAX_VEC_COMPONENTS 629 void test_div_case0()
631 #define MAX_VEC_COMPONENTS 4 635 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: allocate both sources and both destinations. */
648 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 654 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
655 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
658 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
659 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1. */
663 for (loop = 0; loop < TEST_ITERATION; loop++)
665 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
667 GUARD_ARRAY (thedst_c, loop * vec_size);
668 GUARD_ARRAY (thedst_neon, loop * vec_size);
670 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
671 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* The guard checks must still report true after both calls. */
673 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
674 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Per vector: print the inputs with raw bit patterns, then compare outputs within ERROR_MARGIN_LARGE. */
676 for (pos = 0; pos < loop; pos++)
680 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
681 for (i = 0; i < vec_size; i++)
683 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
684 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
687 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
693 free (guarded_dst_c);
694 free (guarded_dst_neon);
/* Performance section: table header, perftest buffers, repeated calls, then the log line. */
697 #ifdef PERFORMANCE_TEST 698 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
701 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
702 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
705 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
706 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The code that sets time_c, time_neon and time_speedup is not visible in this listing. */
711 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
714 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
717 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
718 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
721 free (perftest_guarded_src1);
722 free (perftest_guarded_src2);
723 free (perftest_guarded_dst_c);
724 free (perftest_guarded_dst_neon);
727 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_dot_case0 -- two-source test through ftbl_4args whose destinations are
 * guarded for `loop` elements (not loop * vec_size) and compared one element
 * per position, i.e. one output value per input vector pair. Even slots fill
 * thedst_c, odd slots fill thedst_neon (presumably C reference vs. NEON --
 * confirm against the table setup, which is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; the assignment of vec_size, the func_loop loop, the
 * declarations and the braces are not shown.
 */
728 #undef MAX_VEC_COMPONENTS 731 void test_dot_case0()
733 #define MAX_VEC_COMPONENTS 4 737 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: allocate both sources and both destinations. */
748 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 753 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
754 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
757 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
758 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
762 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Guards cover exactly `loop` destination elements. */
768 GUARD_ARRAY (thedst_c, loop);
769 GUARD_ARRAY (thedst_neon, loop);
771 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
772 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* The guard checks must still report true after both calls. */
774 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
775 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
/*
 * Inputs are still indexed per component (pos * vec_size + i) for printing,
 * while the outputs are compared at index pos with a length of 1.
 */
777 for (pos = 0; pos < loop; pos++)
781 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
782 for (i = 0; i < vec_size; i++)
784 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
785 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
788 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
794 free (guarded_dst_c);
795 free (guarded_dst_neon);
/* Performance section: table header, perftest buffers, repeated calls, then the log line. */
798 #ifdef PERFORMANCE_TEST 799 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
802 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
803 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
806 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
807 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The code that sets time_c, time_neon and time_speedup is not visible in this listing. */
812 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
818 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
819 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
822 free (perftest_guarded_src1);
823 free (perftest_guarded_src2);
824 free (perftest_guarded_dst_c);
825 free (perftest_guarded_dst_neon);
828 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_len_case0 -- single-source test through ftbl_3args. The destinations
 * are guarded for `loop` elements and compared one element per position, i.e.
 * one output value per input vector. Even slots fill thedst_c, odd slots fill
 * thedst_neon (presumably C reference vs. NEON -- confirm against the table
 * setup, which is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; declarations, braces and the func_loop loop are not shown.
 */
829 #undef MAX_VEC_COMPONENTS 832 void test_len_case0()
834 #define MAX_VEC_COMPONENTS 4 847 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: one source buffer and two destination buffers. */
849 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 855 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
858 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
859 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1 input components. */
863 for (loop = 0; loop < TEST_ITERATION; loop++)
865 vec_size = func_loop + 1;
/* Guards cover exactly `loop` destination elements. */
867 GUARD_ARRAY (thedst_c, loop);
868 GUARD_ARRAY (thedst_neon, loop);
870 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
871 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
/* The guard checks must still report true after both calls. */
873 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
874 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
/* Inputs are indexed per component (pos * vec_size + i) for printing. */
876 for (pos = 0; pos < loop; pos++)
880 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
881 for (i = 0; i < vec_size; i++)
883 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/*
 * Index the destinations by pos, matching the `loop`-element guard armed
 * above. Indexing by pos * vec_size would skip most results and read past
 * the guarded range whenever vec_size > 1.
 */
886 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_LARGE, 1);
891 free (guarded_dst_c);
892 free (guarded_dst_neon);
/* Performance section: table header, perftest buffers, repeated calls, then the log line. */
895 #ifdef PERFORMANCE_TEST 896 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
899 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
902 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
903 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The code that sets time_c, time_neon and time_speedup is not visible in this listing. */
908 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
911 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
914 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
915 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
918 free (perftest_guarded_src1);
919 free (perftest_guarded_dst_c);
920 free (perftest_guarded_dst_neon);
923 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mlac_case0 -- accumulator + source + constant test. Uses ftbl_5args_cst
 * with the constant passed by value (thecst[0]) and ftbl_5args with the
 * constant passed as the thecst pointer. Even slots fill thedst_c, odd slots
 * fill thedst_neon (presumably C reference vs. NEON -- confirm against the
 * table setup, which is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; the control flow choosing between the two tables, the
 * declarations, the braces and some statements are not shown.
 */
924 #undef MAX_VEC_COMPONENTS 927 void test_mlac_case0()
929 #define MAX_VEC_COMPONENTS 4 933 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: accumulator, source, constant and two destination buffers. */
947 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 953 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length);
954 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
955 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
958 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
959 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1. */
963 for (loop = 0; loop < TEST_ITERATION; loop++)
965 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
967 GUARD_ARRAY (thedst_c, loop * vec_size);
968 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Constant passed by value: only thecst[0] is used. */
972 ftbl_5args_cst[2 * func_loop] (thedst_c, theacc, thesrc1, thecst[0], loop);
973 ftbl_5args_cst[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst[0], loop);
/* Constant passed by pointer. */
977 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thecst, loop);
978 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst, loop);
/* The guard checks must still report true after the calls. */
981 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
982 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Per vector: print accumulator, source and constant values (here with %f)
 * plus their raw bit patterns, then compare outputs within ERROR_MARGIN_SMALL.
 */
984 for (pos = 0; pos < loop; pos++)
988 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
989 for (i = 0; i < vec_size; i++)
991 fprintf (stdout,
"theacc->%d: %f [0x%04X] \n", i, theacc[pos * vec_size + i], * (
ne10_uint32_t*) &theacc[pos * vec_size + i]);
992 fprintf (stdout,
"thesrc->%d: %f [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
993 fprintf (stdout,
"thecst->%d: %f [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
996 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1001 free (guarded_src1);
1003 free (guarded_dst_c);
1004 free (guarded_dst_neon);
/* Performance section: print the table header and allocate the perftest buffers. */
1007 #ifdef PERFORMANCE_TEST 1008 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1011 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length);
1012 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1013 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1016 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1017 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/*
 * By-value constant pair, func_loop limited to 0. The odd-slot loop is passed
 * to GET_TIME with time_neon; the matching time_c wrapper and the computation
 * of time_speedup are not visible in this listing.
 */
1019 for (func_loop = 0; func_loop < 1; func_loop++)
1022 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args_cst[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1024 GET_TIME (time_neon,
1025 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
1028 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1029 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* By-pointer constant pair; the enclosing func_loop loop is not visible here. */
1034 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1036 GET_TIME (time_neon,
1037 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1040 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1041 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
1044 free (perftest_guarded_acc);
1045 free (perftest_guarded_src1);
1046 free (perftest_guarded_cst);
1047 free (perftest_guarded_dst_c);
1048 free (perftest_guarded_dst_neon);
1051 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mla_case0 -- accumulator + two-source test through ftbl_5args. The even
 * slot fills thedst_c and the odd slot fills thedst_neon (presumably C
 * reference vs. NEON -- confirm against the table setup, which is not part of
 * this listing); the outputs are compared within ERROR_MARGIN_SMALL.
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; declarations, braces, the func_loop loop and some
 * statements are not shown.
 */
1052 #undef MAX_VEC_COMPONENTS 1055 void test_mla_case0()
1057 #define MAX_VEC_COMPONENTS 4 1061 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: accumulator, two sources and two destination buffers. */
1074 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1080 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length);
1081 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1082 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1085 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1086 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1. */
1090 for (loop = 0; loop < TEST_ITERATION; loop++)
1092 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
1094 GUARD_ARRAY (thedst_c, loop * vec_size);
1095 GUARD_ARRAY (thedst_neon, loop * vec_size);
1097 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thesrc2, loop);
1098 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thesrc2, loop);
/* The guard checks must still report true after both calls. */
1100 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1101 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Per vector: print accumulator and both sources with raw bit patterns, then compare outputs. */
1103 for (pos = 0; pos < loop; pos++)
1107 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1108 for (i = 0; i < vec_size; i++)
1110 fprintf (stdout,
"theacc->%d: %e [0x%04X] \n", i, theacc[pos * vec_size + i], * (
ne10_uint32_t*) &theacc[pos * vec_size + i]);
1111 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1112 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1115 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1120 free (guarded_src1);
1121 free (guarded_src2);
1122 free (guarded_dst_c);
1123 free (guarded_dst_neon);
/* Performance section: print the table header and allocate the perftest buffers. */
1126 #ifdef PERFORMANCE_TEST 1127 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1130 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length);
1131 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1132 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1135 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1136 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/*
 * The odd-slot loop is passed to GET_TIME with time_neon; the matching time_c
 * wrapper and the computation of time_speedup are not visible in this listing.
 */
1141 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1143 GET_TIME (time_neon,
1144 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
1147 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1148 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
1151 free (perftest_guarded_acc);
1152 free (perftest_guarded_src1);
1153 free (perftest_guarded_src2);
1154 free (perftest_guarded_dst_c);
1155 free (perftest_guarded_dst_neon);
1158 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mulc_case0 -- source-plus-constant test. Uses ftbl_4args_cst with the
 * constant passed by value (thecst[0]) and ftbl_4args with the constant passed
 * as the thecst pointer. Even slots fill thedst_c, odd slots fill thedst_neon
 * (presumably C reference vs. NEON -- confirm against the table setup, which
 * is not part of this listing).
 * NOTE(review): the leading numbers look like original line numbers from a
 * partial extract; the control flow choosing between the two tables, the
 * declarations, the braces and some statements are not shown.
 */
1159 #undef MAX_VEC_COMPONENTS 1162 void test_mulc_case0()
1164 #define MAX_VEC_COMPONENTS 4 1168 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness section: the constant buffer holds MAX_VEC_COMPONENTS elements. */
1182 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1188 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1189 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1192 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1193 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the count passed to the functions; vec_size = func_loop + 1. */
1197 for (loop = 0; loop < TEST_ITERATION; loop++)
1199 vec_size = func_loop + 1;
/* Arm the destination guards for loop * vec_size elements. */
1201 GUARD_ARRAY (thedst_c, loop * vec_size);
1202 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Constant passed by value: only thecst[0] is used. */
1206 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1207 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Constant passed by pointer. */
1211 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1212 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
/* The guard checks must still report true after the calls. */
1215 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1216 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Per vector: print source and constant values with raw bit patterns, then compare within ERROR_MARGIN_SMALL. */
1218 for (pos = 0; pos < loop; pos++)
1222 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1223 for (i = 0; i < vec_size; i++)
1225 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1226 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
1229 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1233 free (guarded_src1);
1235 free (guarded_dst_c);
1236 free (guarded_dst_neon);
/* Performance section: print the table header and allocate the perftest buffers. */
1239 #ifdef PERFORMANCE_TEST 1240 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1243 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1244 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1247 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1248 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/*
 * By-value constant pair, func_loop limited to 0. The odd-slot loop is passed
 * to GET_TIME with time_neon; the matching time_c wrapper and the computation
 * of time_speedup are not visible in this listing.
 */
1250 for (func_loop = 0; func_loop < 1; func_loop++)
1253 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1255 GET_TIME (time_neon,
1256 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* time_savings = (time_c - time_neon) / time_c, expressed as a percentage. */
1259 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1260 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* By-pointer constant pair; the enclosing func_loop loop is not visible here. */
1265 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1267 GET_TIME (time_neon,
1268 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1271 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1272 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release every perftest buffer allocated above. */
1275 free (perftest_guarded_src1);
1276 free (perftest_guarded_cst);
1277 free (perftest_guarded_dst_c);
1278 free (perftest_guarded_dst_neon);
1281 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mul_case0
 *
 * Calls both entries of an ftbl_4args pair on the same two source arrays and
 * checks that the two destination arrays match.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are used here but set elsewhere - confirm their values against
 * the surrounding code. The even table slot is presumably the C reference and
 * the odd slot the NEON variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1282 #undef MAX_VEC_COMPONENTS 1285 void test_mul_case0()
1287 #define MAX_VEC_COMPONENTS 4 1291 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: two sources and two destinations of fixed_length. */
1304 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1310 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1311 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1314 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1315 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1319 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: func_loop + 1 components. */
1321 vec_size = func_loop + 1;
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1323 GUARD_ARRAY (thedst_c, loop * vec_size);
1324 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs, one destination per table entry. */
1326 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1327 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1329 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1330 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1332 for (pos = 0; pos < loop; pos++)
/* Trace: each input component as a float and as its raw 32-bit pattern. */
1336 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1337 for (i = 0; i < vec_size; i++)
1339 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1340 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1343 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
1347 free (guarded_src1);
1348 free (guarded_src2);
1349 free (guarded_dst_c);
1350 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
1353 #ifdef PERFORMANCE_TEST 1354 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1357 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1358 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1361 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1362 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
1367 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
1369 GET_TIME (time_neon,
1370 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Share of time_c saved by the second variant, as a percentage. */
1373 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1374 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
1377 free (perftest_guarded_src1);
1378 free (perftest_guarded_src2);
1379 free (perftest_guarded_dst_c);
1380 free (perftest_guarded_dst_neon);
1383 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_normalize_case0
 *
 * Calls both entries of an ftbl_3args pair on one source array and checks
 * that the two destination arrays match within ERROR_MARGIN_LARGE.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1384 #undef MAX_VEC_COMPONENTS 1387 void test_normalize_case0()
1389 #define MAX_VEC_COMPONENTS 4 1402 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: one source and two destinations of fixed_length. */
1404 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1410 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1413 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1414 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1418 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: func_loop + 1 components. */
1420 vec_size = func_loop + 1;
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1422 GUARD_ARRAY (thedst_c, loop * vec_size);
1423 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same input, one destination per table entry. */
1425 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
1426 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
1428 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1429 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1431 for (pos = 0; pos < loop; pos++)
/* Trace: each input component as a float and as its raw 32-bit pattern. */
1435 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1436 for (i = 0; i < vec_size; i++)
1438 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* This test uses the larger tolerance, unlike the arithmetic tests next to it. */
1441 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
1445 free (guarded_src1);
1446 free (guarded_dst_c);
1447 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
1450 #ifdef PERFORMANCE_TEST 1451 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1454 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1457 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1458 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
1463 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
1465 GET_TIME (time_neon,
1466 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* Share of time_c saved by the second variant, as a percentage. */
1469 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1470 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
1473 free (perftest_guarded_src1);
1474 free (perftest_guarded_dst_c);
1475 free (perftest_guarded_dst_neon);
1478 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_rsbc_case0
 *
 * Checks an operation that combines a source array with a constant. Two call
 * shapes are exercised: the ftbl_4args_cst entries receive the scalar
 * thecst[0] by value, and the ftbl_4args entries receive the thecst pointer
 * (MAX_VEC_COMPONENTS floats).
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run a pair
 *    of entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: time each entry over every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): the scalar and pointer call pairs appear back to back here;
 * presumably a condition on func_loop selects one of them - confirm.
 * NOTE(review): func_loop (outside the timing loop), fixed_length,
 * perftest_length, time_c and time_speedup are set elsewhere - confirm. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1479 #undef MAX_VEC_COMPONENTS 1482 void test_rsbc_case0()
1484 #define MAX_VEC_COMPONENTS 4 1488 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: source of fixed_length, constant of MAX_VEC_COMPONENTS floats. */
1502 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1508 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1509 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1512 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1513 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1517 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: func_loop + 1 components. */
1519 vec_size = func_loop + 1;
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1521 GUARD_ARRAY (thedst_c, loop * vec_size);
1522 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant shape: thecst[0] is passed by value. */
1526 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1527 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Vector-constant shape: thecst is passed as a pointer. */
1531 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1532 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1535 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1536 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1538 for (pos = 0; pos < loop; pos++)
/* Trace: source and constant components as floats and raw 32-bit patterns. */
1542 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1543 for (i = 0; i < vec_size; i++)
1545 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1546 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
1549 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* NOTE(review): no free of guarded_cst is visible among these releases - confirm it is freed. */
1553 free (guarded_src1);
1555 free (guarded_dst_c);
1556 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate the timing buffers. */
1559 #ifdef PERFORMANCE_TEST 1560 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1563 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1564 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1567 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1568 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The scalar-constant entries are timed for func_loop 0 only. */
1570 for (func_loop = 0; func_loop < 1; func_loop++)
1573 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
/* GET_TIME presumably stores the elapsed time of the loop in time_neon. */
1575 GET_TIME (time_neon,
1576 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* Share of time_c saved by the second variant, as a percentage. */
1579 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1580 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Vector-constant entries, timed the same way. */
1585 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1587 GET_TIME (time_neon,
1588 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1591 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1592 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
1595 free (perftest_guarded_src1);
1596 free (perftest_guarded_cst);
1597 free (perftest_guarded_dst_c);
1598 free (perftest_guarded_dst_neon);
1601 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
1602 #undef MAX_VEC_COMPONENTS 1605 void test_setc_case0()
1607 #define MAX_VEC_COMPONENTS 4 1611 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
1625 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1631 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1634 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1635 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1639 for (loop = 0; loop < TEST_ITERATION; loop++)
1641 vec_size = func_loop + 1;
1643 GUARD_ARRAY (thedst_c, loop * vec_size);
1644 GUARD_ARRAY (thedst_neon, loop * vec_size);
1649 ftbl_3args_cst[2 * func_loop + 1] (thedst_neon, thecst[0], loop);
1653 ftbl_3args[2 * func_loop] (thedst_c, thecst, loop);
1654 ftbl_3args[2 * func_loop + 1] (thedst_neon, thecst, loop);
1657 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1658 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
1660 for (pos = 0; pos < loop; pos++)
1664 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1665 for (i = 0; i < vec_size; i++)
1667 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1668 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
1671 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1676 free (guarded_dst_c);
1677 free (guarded_dst_neon);
1680 #ifdef PERFORMANCE_TEST 1681 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1684 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1687 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1688 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1690 for (func_loop = 0; func_loop < 1; func_loop++)
1693 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args_cst[2 * func_loop] (perftest_thedst_c, perftest_thecst[0], loop);
1695 GET_TIME (time_neon,
1696 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst[0], loop);
1699 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1700 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1705 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thecst, loop);
1707 GET_TIME (time_neon,
1708 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, loop);
1711 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1712 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1715 free (perftest_guarded_cst);
1716 free (perftest_guarded_dst_c);
1717 free (perftest_guarded_dst_neon);
1720 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_subc_case0
 *
 * Checks an operation that combines a source array with a constant. Two call
 * shapes are exercised: the ftbl_4args_cst entries receive the scalar
 * thecst[0] by value, and the ftbl_4args entries receive the thecst pointer
 * (MAX_VEC_COMPONENTS floats).
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run a pair
 *    of entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: time each entry over every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): the scalar and pointer call pairs appear back to back here;
 * presumably a condition on func_loop selects one of them - confirm.
 * NOTE(review): func_loop (outside the timing loop), fixed_length,
 * perftest_length, time_c and time_speedup are set elsewhere - confirm. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1721 #undef MAX_VEC_COMPONENTS 1724 void test_subc_case0()
1726 #define MAX_VEC_COMPONENTS 4 1730 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: source of fixed_length, constant of MAX_VEC_COMPONENTS floats. */
1744 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1750 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1751 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1754 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1755 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1759 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: func_loop + 1 components. */
1761 vec_size = func_loop + 1;
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1763 GUARD_ARRAY (thedst_c, loop * vec_size);
1764 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant shape: thecst[0] is passed by value. */
1768 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1769 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Vector-constant shape: thecst is passed as a pointer. */
1773 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1774 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1777 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1778 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1780 for (pos = 0; pos < loop; pos++)
/* Trace: source and constant components as floats and raw 32-bit patterns. */
1784 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1785 for (i = 0; i < vec_size; i++)
1787 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1788 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
1791 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* NOTE(review): no free of guarded_cst is visible among these releases - confirm it is freed. */
1795 free (guarded_src1);
1797 free (guarded_dst_c);
1798 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate the timing buffers. */
1801 #ifdef PERFORMANCE_TEST 1802 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1805 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1806 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1809 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1810 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* The scalar-constant entries are timed for func_loop 0 only. */
1812 for (func_loop = 0; func_loop < 1; func_loop++)
1815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
/* GET_TIME presumably stores the elapsed time of the loop in time_neon. */
1817 GET_TIME (time_neon,
1818 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* Share of time_c saved by the second variant, as a percentage. */
1821 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1822 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Vector-constant entries, timed the same way. */
1827 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1829 GET_TIME (time_neon,
1830 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1833 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1834 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
1837 free (perftest_guarded_src1);
1838 free (perftest_guarded_cst);
1839 free (perftest_guarded_dst_c);
1840 free (perftest_guarded_dst_neon);
1843 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_sub_case0
 *
 * Calls both entries of an ftbl_4args pair on the same two source arrays and
 * checks that the two destination arrays match.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1844 #undef MAX_VEC_COMPONENTS 1847 void test_sub_case0()
1849 #define MAX_VEC_COMPONENTS 4 1853 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: two sources and two destinations of fixed_length. */
1866 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1872 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1873 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1876 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1877 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1881 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: func_loop + 1 components. */
1883 vec_size = func_loop + 1;
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1885 GUARD_ARRAY (thedst_c, loop * vec_size);
1886 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs, one destination per table entry. */
1888 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1889 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1891 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1892 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1894 for (pos = 0; pos < loop; pos++)
/* Trace: each input component as a float and as its raw 32-bit pattern. */
1898 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1899 for (i = 0; i < vec_size; i++)
1901 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1902 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1905 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
1909 free (guarded_src1);
1910 free (guarded_src2);
1911 free (guarded_dst_c);
1912 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
1915 #ifdef PERFORMANCE_TEST 1916 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
1919 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1920 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1923 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1924 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
1929 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
1931 GET_TIME (time_neon,
1932 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Share of time_c saved by the second variant, as a percentage. */
1935 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1936 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
1939 free (perftest_guarded_src1);
1940 free (perftest_guarded_src2);
1941 free (perftest_guarded_dst_c);
1942 free (perftest_guarded_dst_neon);
1945 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_addmat_case0
 *
 * Matrix counterpart of the two-source vector tests: calls both entries of
 * an ftbl_4args pair on the same two source arrays and checks that the two
 * destination arrays match. Each element is (func_loop + 1) * (func_loop + 1)
 * floats.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
1946 #undef MAX_VEC_COMPONENTS 1949 void test_addmat_case0()
1951 #define MAX_VEC_COMPONENTS 4 1955 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: two sources and two destinations of fixed_length. */
1966 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 1972 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1973 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1976 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1977 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
1981 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: a square of side func_loop + 1. */
1983 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
1985 GUARD_ARRAY (thedst_c, loop * vec_size);
1986 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs, one destination per table entry. */
1988 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1989 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1991 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
1992 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
1994 for (pos = 0; pos < loop; pos++)
/* Trace: each input component as a float and as its raw 32-bit pattern. */
1998 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1999 for (i = 0; i < vec_size; i++)
2001 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2002 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2005 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
2009 free (guarded_src1);
2010 free (guarded_src2);
2011 free (guarded_dst_c);
2012 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
2015 #ifdef PERFORMANCE_TEST 2016 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2019 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2020 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2023 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2024 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
2029 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
2031 GET_TIME (time_neon,
2032 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Share of time_c saved by the second variant, as a percentage. */
2035 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2036 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
2039 free (perftest_guarded_src1);
2040 free (perftest_guarded_src2);
2041 free (perftest_guarded_dst_c);
2042 free (perftest_guarded_dst_neon);
2045 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_detmat_case0
 *
 * Calls both entries of an ftbl_3args pair on one source array of square
 * matrices and checks that the two destination arrays match. Each source
 * element is (func_loop + 1) * (func_loop + 1) floats, while each result is
 * a single float: the destinations are guarded at `loop` floats and compared
 * one float at a time.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at `loop`, run the two entries,
 *    assert the guards, then compare result `pos` of each destination within
 *    ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
2046 #undef MAX_VEC_COMPONENTS 2049 void test_detmat_case0()
2051 #define MAX_VEC_COMPONENTS 4 2055 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: one source and two destinations of fixed_length. */
2066 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2072 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2075 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2076 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the matrix count handed to the functions under test. */
2080 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per source matrix: a square of side func_loop + 1. */
2082 vec_size = (func_loop + 1) * (func_loop + 1);
/* One output float per matrix, so the guarded region is `loop` floats long. */
2084 GUARD_ARRAY (thedst_c, loop);
2085 GUARD_ARRAY (thedst_neon, loop);
/* Same input, one destination per table entry. */
2087 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2088 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2090 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop));
2091 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop));
/* Compare the results matrix by matrix. */
2093 for (pos = 0; pos < loop; pos++)
/* Trace: each source component as a float and as its raw 32-bit pattern. */
2097 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2098 for (i = 0; i < vec_size; i++)
2100 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/*
 * Result `pos` lives at index pos of the destination, not pos * vec_size:
 * the source stride applies to the input matrices only. Indexing with
 * pos * vec_size would skip most results and read past the `loop` floats
 * that were written.
 */
2103 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
/* Release the conformance buffers through their guarded base pointers. */
2107 free (guarded_src1);
2108 free (guarded_dst_c);
2109 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
2112 #ifdef PERFORMANCE_TEST 2113 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2116 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2119 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2120 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
2125 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
2127 GET_TIME (time_neon,
2128 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* Share of time_c saved by the second variant, as a percentage. */
2131 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2132 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
2135 free (perftest_guarded_src1);
2136 free (perftest_guarded_dst_c);
2137 free (perftest_guarded_dst_neon);
2140 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_identitymat_case0
 *
 * Exercises the ftbl_2args entries, which take only a destination and a
 * count (no source), and checks that the two destination arrays match. Each
 * element is (func_loop + 1) * (func_loop + 1) floats.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the
 *    entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
2141 #undef MAX_VEC_COMPONENTS 2144 void test_identitymat_case0()
2146 #define MAX_VEC_COMPONENTS 4 2150 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: two destinations of fixed_length, no source buffer. */
2161 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2167 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2168 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
2172 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: a square of side func_loop + 1. */
2174 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
2176 GUARD_ARRAY (thedst_c, loop * vec_size);
2177 GUARD_ARRAY (thedst_neon, loop * vec_size);
/*
 * NOTE(review): only the odd-slot call into thedst_neon is visible here; no
 * even-slot call filling thedst_c is shown - confirm it exists.
 */
2180 ftbl_2args[2 * func_loop + 1] (thedst_neon, loop);
2182 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2183 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
2185 for (pos = 0; pos < loop; pos++)
2188 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2190 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
2194 free (guarded_dst_c);
2195 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
2198 #ifdef PERFORMANCE_TEST 2199 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2202 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2203 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
2208 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_2args[2 * func_loop] (perftest_thedst_c, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
2210 GET_TIME (time_neon,
2211 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_2args[2 * func_loop + 1] (perftest_thedst_neon, loop);
/* Share of time_c saved by the second variant, as a percentage. */
2214 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2215 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
2218 free (perftest_guarded_dst_c);
2219 free (perftest_guarded_dst_neon);
2222 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_invmat_case0
 *
 * Calls both entries of an ftbl_3args pair on one source array of square
 * matrices and checks that the two destination arrays match within
 * ERROR_MARGIN_LARGE. Each element is (func_loop + 1) * (func_loop + 1)
 * floats in both source and destination.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
2223 #undef MAX_VEC_COMPONENTS 2226 void test_invmat_case0()
2228 #define MAX_VEC_COMPONENTS 4 2232 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: one source and two destinations of fixed_length. */
2243 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2249 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2252 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2253 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
2257 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: a square of side func_loop + 1. */
2259 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
2261 GUARD_ARRAY (thedst_c, loop * vec_size);
2262 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same input, one destination per table entry. */
2264 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2265 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2267 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2268 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
2270 for (pos = 0; pos < loop; pos++)
/* Trace: each source component as a float and as its raw 32-bit pattern. */
2274 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2275 for (i = 0; i < vec_size; i++)
2277 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* This test uses the larger tolerance, unlike the add/sub/mul matrix tests. */
2280 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
2284 free (guarded_src1);
2285 free (guarded_dst_c);
2286 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
2289 #ifdef PERFORMANCE_TEST 2290 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2293 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2296 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2297 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
2302 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
2304 GET_TIME (time_neon,
2305 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* Share of time_c saved by the second variant, as a percentage. */
2308 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2309 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
2312 free (perftest_guarded_src1);
2313 free (perftest_guarded_dst_c);
2314 free (perftest_guarded_dst_neon);
2317 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mulmat_case0
 *
 * Matrix counterpart of the two-source vector tests: calls both entries of
 * an ftbl_4args pair on the same two source arrays and checks that the two
 * destination arrays match. Each element is (func_loop + 1) * (func_loop + 1)
 * floats.
 *
 *  - SMOKE_TEST / REGRESSION_TEST: for every count `loop` below
 *    TEST_ITERATION, guard both destinations at loop * vec_size, run the two
 *    entries, assert the guards, then compare each vec_size-float result
 *    within ERROR_MARGIN_SMALL.
 *  - PERFORMANCE_TEST: run each entry for every count below
 *    PERF_TEST_ITERATION and log the timings with ne10_log.
 *
 * NOTE(review): func_loop, fixed_length, perftest_length, time_c and
 * time_speedup are set elsewhere - confirm against the surrounding code. The
 * even table slot is presumably the C reference and the odd slot the NEON
 * variant, going by the *_c / *_neon buffer names.
 * NOTE(review): the conformance guard tests `(REGRESSION_TEST)` by value
 * rather than with `defined` - confirm that is intended.
 */
2318 #undef MAX_VEC_COMPONENTS 2321 void test_mulmat_case0()
2323 #define MAX_VEC_COMPONENTS 4 2327 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: two sources and two destinations of fixed_length. */
2338 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2344 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2345 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
2348 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2349 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* `loop` is the element count handed to the functions under test. */
2353 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per element: a square of side func_loop + 1. */
2355 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard the region of loop * vec_size floats so overruns can be asserted below. */
2357 GUARD_ARRAY (thedst_c, loop * vec_size);
2358 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs, one destination per table entry. */
2360 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2361 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2363 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2364 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/* Compare the results element by element. */
2366 for (pos = 0; pos < loop; pos++)
/* Trace: each input component as a float and as its raw 32-bit pattern. */
2370 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2371 for (i = 0; i < vec_size; i++)
2373 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2374 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2377 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the conformance buffers through their guarded base pointers. */
2381 free (guarded_src1);
2382 free (guarded_src2);
2383 free (guarded_dst_c);
2384 free (guarded_dst_neon);
/* Timing part: print the table header, then allocate perftest_length buffers. */
2387 #ifdef PERFORMANCE_TEST 2388 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2391 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2392 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2395 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2396 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot entry over every count below PERF_TEST_ITERATION. */
2401 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Odd-slot entry; GET_TIME presumably stores the elapsed time of the loop in time_neon. */
2403 GET_TIME (time_neon,
2404 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Share of time_c saved by the second variant, as a percentage. */
2407 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2408 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing buffers. */
2411 free (perftest_guarded_src1);
2412 free (perftest_guarded_src2);
2413 free (perftest_guarded_dst_c);
2414 free (perftest_guarded_dst_neon);
2417 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_submat_case0
 *
 * Runs the paired entries of ftbl_4args (slot 2 * func_loop writes
 * thedst_c, slot 2 * func_loop + 1 writes thedst_neon) on two source
 * buffers, checks that both outputs agree, and under PERFORMANCE_TEST
 * reports timing for both slots.
 *
 * NOTE(review): this listing is incomplete. The leading numbers are
 * original-file line numbers and they skip (e.g. 2427 -> 2438), so braces,
 * local declarations, the loop that advances func_loop and the statements
 * that set time_c / time_speedup are not visible here. The first token on
 * the next line (#undef at 2418) closes the preceding function.
 */
2418 #undef MAX_VEC_COMPONENTS 2421 void test_submat_case0()
2423 #define MAX_VEC_COMPONENTS 4 2427 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/*
 * Correctness path.
 * NOTE(review): the second operand of the condition is (REGRESSION_TEST),
 * not defined (REGRESSION_TEST), so the preprocessor evaluates the macro's
 * value instead of testing whether it is defined - confirm this is intended.
 * Two source and two destination buffers are set up, each sized by
 * fixed_length.
 */
2438 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2444 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2445 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
2448 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2449 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* loop is passed to the functions under test as their last argument. */
2453 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Per-item stride into the buffers: (func_loop + 1) squared. */
2455 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard loop * vec_size values of each destination before the calls;
 * presumably this lets the later check detect out-of-range writes. */
2457 GUARD_ARRAY (thedst_c, loop * vec_size);
2458 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs, two table slots, two separate destinations. */
2460 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2461 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2463 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2464 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Walk every item index below loop. Original lines 2467-2469 are not
 * shown, so whether the prints below are conditionally compiled cannot be
 * determined from here.
 */
2466 for (pos = 0; pos < loop; pos++)
2470 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
/*
 * Print each input value with %e and as a hex word read through a
 * ne10_uint32_t pointer cast.
 * NOTE(review): that cast is type punning; a memcpy into a ne10_uint32_t
 * would avoid strict-aliasing concerns - assumes the sources are 32-bit
 * floats, TODO confirm.
 */
2471 for (i = 0; i < vec_size; i++)
2473 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2474 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
/* Compare vec_size values of item pos from both destinations using
 * ERROR_MARGIN_SMALL. */
2477 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the four guarded_* pointers of the correctness path. */
2481 free (guarded_src1);
2482 free (guarded_src2);
2483 free (guarded_dst_c);
2484 free (guarded_dst_neon);
/* Timing path: print the table header, then set up buffers sized by
 * perftest_length. */
2487 #ifdef PERFORMANCE_TEST 2488 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2491 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2492 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2495 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2496 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot run. Presumably wrapped in GET_TIME (time_c, ...) on original
 * line 2500, which is not shown - TODO confirm. */
2501 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Odd-slot run, passed to GET_TIME together with time_neon. */
2503 GET_TIME (time_neon,
2504 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Percentage of time_c saved: (time_c - time_neon) / time_c * 100. */
2507 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
/* time_speedup is assigned on a line that is not part of this listing. */
2508 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing-path buffers. */
2511 free (perftest_guarded_src1);
2512 free (perftest_guarded_src2);
2513 free (perftest_guarded_dst_c);
2514 free (perftest_guarded_dst_neon);
2517 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_transmat_case0
 *
 * Single-source variant of the comparison test: runs the paired entries of
 * ftbl_3args (slot 2 * func_loop writes thedst_c, slot 2 * func_loop + 1
 * writes thedst_neon) on thesrc1, checks that both outputs agree, and under
 * PERFORMANCE_TEST reports timing for both slots.
 *
 * NOTE(review): this listing is incomplete. The leading numbers are
 * original-file line numbers and they skip (e.g. 2527 -> 2538), so braces,
 * local declarations, the loop that advances func_loop and the statements
 * that set time_c / time_speedup are not visible here. The first token on
 * the next line (#undef at 2518) closes the preceding function.
 */
2518 #undef MAX_VEC_COMPONENTS 2521 void test_transmat_case0()
2523 #define MAX_VEC_COMPONENTS 4 2527 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/*
 * Correctness path.
 * NOTE(review): the second operand of the condition is (REGRESSION_TEST),
 * not defined (REGRESSION_TEST), so the preprocessor evaluates the macro's
 * value instead of testing whether it is defined - confirm this is intended.
 * One source and two destination buffers are set up, each sized by
 * fixed_length.
 */
2538 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2544 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2547 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2548 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* loop is passed to the functions under test as their last argument. */
2552 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Per-item stride into the buffers: (func_loop + 1) squared. */
2554 vec_size = (func_loop + 1) * (func_loop + 1);
/* Guard loop * vec_size values of each destination before the calls;
 * presumably this lets the later check detect out-of-range writes. */
2556 GUARD_ARRAY (thedst_c, loop * vec_size);
2557 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same input, two table slots, two separate destinations. */
2559 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2560 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2562 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2563 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Walk every item index below loop. Original lines 2566-2568 are not
 * shown, so whether the prints below are conditionally compiled cannot be
 * determined from here.
 */
2565 for (pos = 0; pos < loop; pos++)
2569 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
/*
 * Print each input value with %e and as a hex word read through a
 * ne10_uint32_t pointer cast.
 * NOTE(review): that cast is type punning; a memcpy into a ne10_uint32_t
 * would avoid strict-aliasing concerns - assumes the source is 32-bit
 * floats, TODO confirm.
 */
2570 for (i = 0; i < vec_size; i++)
2572 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* Compare vec_size values of item pos from both destinations using
 * ERROR_MARGIN_SMALL. */
2575 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release the three guarded_* pointers of the correctness path. */
2579 free (guarded_src1);
2580 free (guarded_dst_c);
2581 free (guarded_dst_neon);
/* Timing path: print the table header, then set up buffers sized by
 * perftest_length. */
2584 #ifdef PERFORMANCE_TEST 2585 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2588 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot run. Presumably wrapped in GET_TIME (time_c, ...) on original
 * line 2596, which is not shown - TODO confirm. */
2597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Odd-slot run, passed to GET_TIME together with time_neon. */
2599 GET_TIME (time_neon,
2600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* Percentage of time_c saved: (time_c - time_neon) / time_c * 100. */
2603 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
/* time_speedup is assigned on a line that is not part of this listing. */
2604 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing-path buffers. */
2607 free (perftest_guarded_src1);
2608 free (perftest_guarded_dst_c);
2609 free (perftest_guarded_dst_neon);
2612 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * test_mulcmatvec_case0
 *
 * Constant-plus-source variant of the comparison test: runs the paired
 * entries of ftbl_4args (slot 2 * func_loop writes thedst_c, slot
 * 2 * func_loop + 1 writes thedst_neon) with thecst as the second argument
 * and thesrc1 as the third, checks that both outputs agree, and under
 * PERFORMANCE_TEST reports timing for both slots.
 *
 * NOTE(review): this listing is incomplete. The leading numbers are
 * original-file line numbers and they skip (e.g. 2622 -> 2633), so braces,
 * local declarations, the loop that advances func_loop and the statements
 * that set time_c / time_speedup are not visible here. The first token on
 * the next line (#undef at 2613) closes the preceding function.
 */
2613 #undef MAX_VEC_COMPONENTS 2616 void test_mulcmatvec_case0()
2618 #define MAX_VEC_COMPONENTS 4 2622 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/*
 * Correctness path.
 * NOTE(review): the second operand of the condition is (REGRESSION_TEST),
 * not defined (REGRESSION_TEST), so the preprocessor evaluates the macro's
 * value instead of testing whether it is defined - confirm this is intended.
 * thesrc1 and both destinations are sized by fixed_length; thecst is sized
 * by MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS.
 */
2633 #if defined (SMOKE_TEST)||(REGRESSION_TEST) 2639 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2640 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS);
2643 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2644 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* loop is passed to the functions under test as their last argument. */
2648 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Per-item stride into thesrc1 and the destinations: func_loop + 1. */
2650 vec_size = func_loop + 1;
/* Guard loop * vec_size values of each destination before the calls;
 * presumably this lets the later check detect out-of-range writes. */
2652 GUARD_ARRAY (thedst_c, loop * vec_size);
2653 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same constant and source, two table slots, two separate destinations. */
2655 ftbl_4args[2 * func_loop] (thedst_c, thecst, thesrc1, loop);
2656 ftbl_4args[2 * func_loop + 1] (thedst_neon, thecst, thesrc1, loop);
2658 assert_true (CHECK_ARRAY_GUARD (thedst_c, loop * vec_size));
2659 assert_true (CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size));
/*
 * Walk every item index below loop. Original lines 2662-2664 are not
 * shown, so whether the prints below are conditionally compiled cannot be
 * determined from here.
 */
2661 for (pos = 0; pos < loop; pos++)
2665 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
/*
 * Print the first vec_size * vec_size entries of thecst, each with %e and
 * as a hex word read through a ne10_uint32_t pointer cast.
 * NOTE(review): that cast is type punning; a memcpy into a ne10_uint32_t
 * would avoid strict-aliasing concerns - assumes the data is 32-bit floats,
 * TODO confirm.
 */
2666 for (i = 0; i < vec_size * vec_size; i++)
2668 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (
ne10_uint32_t*) &thecst[i]);
/* Print the vec_size source values of item pos in the same two forms. */
2670 for (i = 0; i < vec_size; i++)
2672 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (
ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* Compare vec_size values of item pos from both destinations using
 * ERROR_MARGIN_SMALL. */
2675 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/*
 * Release the correctness-path buffers.
 * NOTE(review): no free of guarded_cst is visible here; original line 2680
 * is missing from this listing. Verify it releases guarded_cst, the way
 * perftest_guarded_cst is released at 2710.
 */
2679 free (guarded_src1);
2681 free (guarded_dst_c);
2682 free (guarded_dst_neon);
/* Timing path: print the table header, then set up the source and
 * destinations sized by perftest_length and the constant sized by
 * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS. */
2685 #ifdef PERFORMANCE_TEST 2686 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time (micro-s)",
"NEON Time (micro-s)",
"Time Savings",
"Performance Ratio");
2689 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2690 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS);
2693 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2694 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Even-slot run. Presumably wrapped in GET_TIME (time_c, ...) on original
 * line 2698, which is not shown - TODO confirm. */
2699 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thecst, perftest_thesrc1, loop);
/* Odd-slot run, passed to GET_TIME together with time_neon. */
2701 GET_TIME (time_neon,
2702 for (loop = 0; loop < PERF_TEST_ITERATION; loop++)
ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, perftest_thesrc1, loop);
/* Percentage of time_c saved: (time_c - time_neon) / time_c * 100. */
2705 time_savings = ( ( (
ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
/* time_speedup is assigned on a line that is not part of this listing. */
2706 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Release the timing-path buffers, including the constant. */
2709 free (perftest_guarded_src1);
2710 free (perftest_guarded_cst);
2711 free (perftest_guarded_dst_c);
2712 free (perftest_guarded_dst_neon);
2715 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/*
 * Per-operation entry points. Each wrapper body visible here is a single
 * call to the matching test_<op>_case0() function.
 *
 * NOTE(review): this listing is incomplete. The leading numbers are
 * original-file line numbers; braces are missing, and the headers of the
 * wrappers that contain the calls at 2805, 2810, 2820, 2825 and 2835 are
 * not shown. The #undef at 2716 closes the preceding function.
 */
2716 #undef MAX_VEC_COMPONENTS 2778 void test_normalize()
2780 test_normalize_case0();
2805 test_addmat_case0();
2810 test_detmat_case0();
2813 void test_identitymat()
2815 test_identitymat_case0();
2820 test_invmat_case0();
2825 test_mulmat_case0();
2828 void test_mulcmatvec()
2830 test_mulcmatvec_case0();
2835 test_submat_case0();
2838 void test_transmat()
2840 test_transmat_case0();
/* Points ne10_log_buffer_ptr at ne10_log_buffer. The header of the
 * function that contains this statement is not shown in this listing. */
2846 ne10_log_buffer_ptr = ne10_log_buffer;
/*
 * test_fixture_math
 *
 * Fixture entry point for the math tests: calls test_fixture_start() and
 * then hands each test_<op> function to run_test in the order listed below.
 *
 * NOTE(review): original lines 2857-2860 and everything after 2885 are not
 * part of this listing, so any set-up registration or fixture-end call is
 * not visible here.
 */
2854 void test_fixture_math (
void)
2856 test_fixture_start();
/* Element-wise and vector operations. */
2861 run_test (test_abs);
2862 run_test (test_addc);
2863 run_test (test_add);
2864 run_test (test_cross);
2865 run_test (test_divc);
2866 run_test (test_div);
2867 run_test (test_dot);
2868 run_test (test_len);
2869 run_test (test_mlac);
2870 run_test (test_mla);
2871 run_test (test_mulc);
2872 run_test (test_mul);
2873 run_test (test_normalize);
2874 run_test (test_rsbc);
2875 run_test (test_setc);
2876 run_test (test_subc);
2877 run_test (test_sub);
/* Matrix operations. */
2878 run_test (test_addmat);
2879 run_test (test_detmat);
2880 run_test (test_identitymat);
2881 run_test (test_invmat);
2882 run_test (test_mulmat);
2883 run_test (test_mulcmatvec);
2884 run_test (test_submat);
2885 run_test (test_transmat);
ne10_result_t ne10_mlac_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec2f using plain C code.
ne10_result_t ne10_len_vec3f_c(ne10_float32_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec3f using plain C code.
ne10_result_t ne10_addc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec4f_neon")
Specific implementation of ne10_addc_vec4f using NEON intrinsics.
ne10_result_t ne10_addmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_3x3f using NEON intrinsics.
ne10_result_t ne10_subc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_abs_vec4f_neon")
Specific implementation of ne10_abs_vec4f using NEON intrinsics.
ne10_result_t ne10_normalize_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec3f_neon")
Specific implementation of ne10_normalize_vec3f using NEON intrinsics.
ne10_result_t ne10_addc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec3f_neon")
Specific implementation of ne10_addc_vec3f using NEON intrinsics.
ne10_result_t ne10_mulc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec2f using plain C code.
ne10_result_t ne10_mlac_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec4f using plain C code.
ne10_result_t ne10_cross_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_cross_vec3f using plain C code.
ne10_result_t ne10_abs_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_abs_vec2f_neon")
Specific implementation of ne10_abs_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_divc_float using plain C code.
ne10_result_t ne10_dot_vec4f_neon(ne10_float32_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec4f_neon")
Specific implementation of ne10_dot_vec4f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec2f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec2f using plain C code.
ne10_result_t ne10_divc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec4f using NEON intrinsics.
ne10_result_t ne10_sub_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec2f_neon")
Specific implementation of ne10_sub_vec2f using NEON intrinsics.
ne10_result_t ne10_sub_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec3f_neon")
Specific implementation of ne10_sub_vec3f using NEON intrinsics.
ne10_result_t ne10_submat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_4x4f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec3f using plain C code.
ne10_result_t ne10_transmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_3x3f using plain C code.
ne10_result_t ne10_transmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_2x2f using plain C code.
ne10_result_t ne10_addc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec3f using plain C code.
ne10_result_t ne10_vmla_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec4f using plain C code.
ne10_result_t ne10_dot_vec4f_c(ne10_float32_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec4f using plain C code.
ne10_result_t ne10_cross_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_cross_vec3f_neon")
Specific implementation of ne10_cross_vec3f using NEON intrinsics.
ne10_func_2args_t ftbl_2args[MAX_FUNC_COUNT]
ne10_result_t ne10_normalize_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec2f_neon")
Specific implementation of ne10_normalize_vec2f using NEON intrinsics.
ne10_result_t ne10_detmat_3x3f_c(ne10_float32_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_3x3f using plain C code.
ne10_result_t ne10_addmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_4x4f using NEON intrinsics.
ne10_result_t ne10_subc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec4f using NEON intrinsics.
ne10_result_t ne10_submat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_4x4f using plain C code.
ne10_result_t ne10_invmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_invmat_2x2f_neon")
Specific implementation of ne10_invmat_2x2f using NEON intrinsics.
ne10_result_t ne10_identitymat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_3x3f_neon")
Specific implementation of ne10_identitymat_3x3f using NEON intrinsics.
ne10_result_t ne10_invmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_invmat_4x4f_neon")
Specific implementation of ne10_invmat_4x4f using NEON intrinsics.
ne10_result_t ne10_identitymat_2x2f_c(ne10_mat2x2f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_2x2f using plain C code.
ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_neon(ne10_vec2f_t *dst, const ne10_mat2x2f_t *cst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm2x2f_v2f_neon")
Specific implementation of ne10_mulcmatvec_cm2x2f_v2f using NEON intrinsics.
ne10_result_t ne10_add_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_add_vec2f_neon")
Specific implementation of ne10_add_vec2f using NEON intrinsics.
ne10_result_t ne10_add_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec2f using plain C code.
ne10_result_t ne10_add_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec3f using plain C code.
ne10_result_t ne10_abs_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec4f using plain C code.
ne10_result_t ne10_identitymat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_4x4f_neon")
Specific implementation of ne10_identitymat_4x4f using NEON intrinsics.
ne10_result_t ne10_add_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_vec4f using plain C code.
ne10_result_t ne10_vdiv_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec2f_neon")
Specific implementation of ne10_vdiv_vec2f using NEON intrinsics.
ne10_result_t ne10_normalize_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_normalize_vec4f_neon")
Specific implementation of ne10_normalize_vec4f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec2f using plain C code.
ne10_result_t ne10_addc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count) asm("ne10_addc_vec2f_neon")
Specific implementation of ne10_addc_vec2f using NEON intrinsics.
ne10_result_t ne10_addmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_2x2f using plain C code.
ne10_result_t ne10_detmat_4x4f_neon(ne10_float32_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_detmat_4x4f_neon")
Specific implementation of ne10_detmat_4x4f using NEON intrinsics.
ne10_result_t ne10_mulc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec3f using NEON intrinsics.
ne10_result_t ne10_identitymat_4x4f_c(ne10_mat4x4f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_4x4f using plain C code.
ne10_result_t ne10_mlac_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec3f using NEON intrinsics.
ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_c(ne10_vec4f_t *dst, const ne10_mat4x4f_t *cst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm4x4f_v4f using plain C code.
ne10_result_t ne10_addc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec2f using plain C code.
ne10_result_t ne10_subc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_subc_float using plain C code.
ne10_result_t ne10_sub_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_sub_float_neon")
Specific implementation of ne10_sub_float using NEON intrinsics.
ne10_result_t ne10_mulc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec3f using plain C code.
ne10_result_t ne10_mulc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_float using NEON intrinsics.
ne10_result_t ne10_sub_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec4f using plain C code.
ne10_result_t ne10_mulmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_3x3f_neon")
Specific implementation of ne10_mulmat_3x3f using NEON intrinsics.
ne10_result_t ne10_mulc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_float using plain C code.
ne10_result_t ne10_len_vec4f_c(ne10_float32_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec4f using plain C code.
ne10_result_t ne10_setc_vec4f_neon(ne10_vec4f_t *dst, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec4f using NEON intrinsics.
ne10_result_t ne10_identitymat_3x3f_c(ne10_mat3x3f_t *dst, ne10_uint32_t count)
Specific implementation of ne10_identitymat_3x3f using plain C code.
ne10_result_t ne10_normalize_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec4f using plain C code.
ne10_result_t ne10_vmla_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec3f using plain C code.
ne10_result_t ne10_normalize_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec3f using plain C code.
ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_neon(ne10_vec3f_t *dst, const ne10_mat3x3f_t *cst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm3x3f_v3f_neon")
Specific implementation of ne10_mulcmatvec_cm3x3f_v3f using NEON intrinsics.
ne10_result_t ne10_setc_vec2f_neon(ne10_vec2f_t *dst, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec2f using NEON intrinsics.
void my_test_teardown(void)
ne10_result_t ne10_submat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_2x2f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec4f_neon")
Specific implementation of ne10_vdiv_vec4f using NEON intrinsics.
ne10_result_t ne10_len_vec2f_c(ne10_float32_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_len_vec2f using plain C code.
ne10_result_t ne10_divc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_float_c(ne10_float32_t *dst, ne10_float32_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_float using plain C code.
ne10_result_t ne10_rsbc_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec3f using NEON intrinsics.
ne10_result_t ne10_addc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count) asm("ne10_addc_float_neon")
Specific implementation of ne10_addc_float using NEON intrinsics.
ne10_result_t ne10_vmla_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmla_vec2f using plain C code.
ne10_result_t ne10_vmul_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec3f using plain C code.
ne10_result_t ne10_identitymat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_uint32_t count) asm("ne10_identitymat_2x2f_neon")
Specific implementation of ne10_identitymat_2x2f using NEON intrinsics.
ne10_result_t ne10_vmla_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec2f_neon")
Specific implementation of ne10_vmla_vec2f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vdiv_vec3f_neon")
Specific implementation of ne10_vdiv_vec3f using NEON intrinsics.
ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_c(ne10_vec2f_t *dst, const ne10_mat2x2f_t *cst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm2x2f_v2f using plain C code.
ne10_result_t ne10_div_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_div_float using plain C code.
ne10_result_t ne10_mul_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_mul_float_neon")
Specific implementation of ne10_mul_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec2f using plain C code.
ne10_result_t ne10_mulmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_2x2f_neon")
Specific implementation of ne10_mulmat_2x2f using NEON intrinsics.
ne10_result_t ne10_detmat_2x2f_c(ne10_float32_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_2x2f using plain C code.
ne10_result_t ne10_dot_vec3f_c(ne10_float32_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec3f using plain C code.
ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_neon(ne10_vec4f_t *dst, const ne10_mat4x4f_t *cst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_mulcmatvec_cm4x4f_v4f_neon")
Specific implementation of ne10_mulcmatvec_cm4x4f_v4f using NEON intrinsics.
ne10_func_5args_cst_t ftbl_5args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_transmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_transmat_3x3f_neon")
Specific implementation of ne10_transmat_3x3f using NEON intrinsics.
ne10_result_t ne10_vmul_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec4f_neon")
Specific implementation of ne10_vmul_vec4f using NEON intrinsics.
ne10_result_t ne10_mlac_float_neon(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_float using NEON intrinsics.
ne10_result_t ne10_detmat_4x4f_c(ne10_float32_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_detmat_4x4f using plain C code.
ne10_result_t ne10_submat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_3x3f using NEON intrinsics.
ne10_result_t ne10_addc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_addc_vec4f using plain C code.
ne10_result_t ne10_mlac_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec3f using plain C code.
ne10_func_3args_cst_t ftbl_3args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_addc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_addc_float using plain C code.
ne10_result_t ne10_setc_vec3f_neon(ne10_vec3f_t *dst, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec3f using NEON intrinsics.
ne10_result_t ne10_setc_vec3f_c(ne10_vec3f_t *dst, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec3f using plain C code.
ne10_result_t ne10_abs_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_abs_vec3f_neon")
Specific implementation of ne10_abs_vec3f using NEON intrinsics.
ne10_func_4args_cst_t ftbl_4args_cst[MAX_FUNC_COUNT]
ne10_result_t ne10_mulc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec4f using plain C code.
ne10_result_t ne10_rsbc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_float using NEON intrinsics.
ne10_result_t ne10_len_vec2f_neon(ne10_float32_t *dst, ne10_vec2f_t *src, ne10_uint32_t count) asm("ne10_len_vec2f_neon")
Specific implementation of ne10_len_vec2f using NEON intrinsics.
ne10_result_t ne10_abs_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec2f using plain C code.
ne10_result_t ne10_mulmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_2x2f using plain C code.
ne10_result_t ne10_setc_vec2f_c(ne10_vec2f_t *dst, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec2f using plain C code.
ne10_result_t ne10_submat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_3x3f using plain C code.
ne10_result_t ne10_abs_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_abs_vec3f using plain C code.
ne10_result_t ne10_add_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_add_vec3f_neon")
Specific implementation of ne10_add_vec3f using NEON intrinsics.
ne10_result_t ne10_abs_float_neon(ne10_float32_t *dst, ne10_float32_t *src, ne10_uint32_t count) asm("ne10_abs_float_neon")
Specific implementation of ne10_abs_float using NEON intrinsics.
ne10_func_5args_t ftbl_5args[MAX_FUNC_COUNT]
ne10_result_t ne10_divc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_divc_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec3f_neon")
Specific implementation of ne10_vmul_vec3f using NEON intrinsics.
ne10_result_t ne10_rsbc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec4f using plain C code.
ne10_result_t ne10_dot_vec2f_c(ne10_float32_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_dot_vec2f using plain C code.
ne10_result_t ne10_subc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec2f using plain C code.
ne10_result_t ne10_transmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_transmat_4x4f using plain C code.
ne10_result_t ne10_vdiv_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec3f using plain C code.
ne10_result_t ne10_invmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_4x4f using plain C code.
ne10_result_t ne10_vmla_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec4f_neon")
Specific implementation of ne10_vmla_vec4f using NEON intrinsics.
ne10_result_t ne10_vdiv_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vdiv_vec4f using plain C code.
ne10_result_t ne10_vmul_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_vmul_vec2f_neon")
Specific implementation of ne10_vmul_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec3f using plain C code.
ne10_result_t ne10_detmat_3x3f_neon(ne10_float32_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_detmat_3x3f_neon")
Specific implementation of ne10_detmat_3x3f using NEON intrinsics.
ne10_result_t ne10_mulmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_3x3f using plain C code.
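ne10_func_3args_t ftbl_3args[MAX_FUNC_COUNT]
Table of the three-argument functions under test; the test loops call entry 2 * func_loop with the C destination buffer and entry 2 * func_loop + 1 with the NEON destination buffer.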
ne10_result_t ne10_transmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_transmat_2x2f_neon")
Specific implementation of ne10_transmat_2x2f using NEON intrinsics.
ne10_result_t ne10_dot_vec2f_neon(ne10_float32_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec2f_neon")
Specific implementation of ne10_dot_vec2f using NEON intrinsics.
ne10_result_t ne10_vmla_vec3f_neon(ne10_vec3f_t *dst, ne10_vec3f_t *acc, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_vmla_vec3f_neon")
Specific implementation of ne10_vmla_vec3f using NEON intrinsics.
ne10_result_t ne10_sub_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_float using plain C code.
ne10_result_t ne10_mul_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mul_float using plain C code.
ne10_result_t ne10_addmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src1, ne10_mat3x3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_3x3f using plain C code.
ne10_result_t ne10_len_vec4f_neon(ne10_float32_t *dst, ne10_vec4f_t *src, ne10_uint32_t count) asm("ne10_len_vec4f_neon")
Specific implementation of ne10_len_vec4f using NEON intrinsics.
ne10_result_t ne10_sub_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_sub_vec4f_neon")
Specific implementation of ne10_sub_vec4f using NEON intrinsics.
ne10_result_t ne10_dot_vec3f_neon(ne10_float32_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count) asm("ne10_dot_vec3f_neon")
Specific implementation of ne10_dot_vec3f using NEON intrinsics.
ne10_result_t ne10_mulmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count) asm("ne10_mulmat_4x4f_neon")
Specific implementation of ne10_mulmat_4x4f using NEON intrinsics.
ne10_result_t ne10_rsbc_float_c(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_float using plain C code.
ne10_result_t ne10_setc_vec4f_c(ne10_vec4f_t *dst, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_setc_vec4f using plain C code.
ne10_result_t ne10_add_float_c(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_add_float using plain C code.
ne10_result_t ne10_subc_float_neon(ne10_float32_t *dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_subc_float using NEON intrinsics.
ne10_result_t ne10_mlac_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *acc, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec4f using NEON intrinsics.
ne10_result_t ne10_transmat_4x4f_neon(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src, ne10_uint32_t count) asm("ne10_transmat_4x4f_neon")
Specific implementation of ne10_transmat_4x4f using NEON intrinsics.
ne10_result_t ne10_add_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_add_float_neon")
Specific implementation of ne10_add_float using NEON intrinsics.
ne10_result_t ne10_vmul_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_vmul_vec4f using plain C code.
ne10_result_t ne10_sub_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src1, ne10_vec2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec2f using plain C code.
ne10_result_t ne10_len_vec3f_neon(ne10_float32_t *dst, ne10_vec3f_t *src, ne10_uint32_t count) asm("ne10_len_vec3f_neon")
Specific implementation of ne10_len_vec3f using NEON intrinsics.
ne10_result_t ne10_invmat_3x3f_neon(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count) asm("ne10_invmat_3x3f_neon")
Specific implementation of ne10_invmat_3x3f using NEON intrinsics.
ne10_result_t ne10_submat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_submat_2x2f using plain C code.
ne10_result_t ne10_mlac_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *acc, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_vec2f using NEON intrinsics.
ne10_result_t ne10_detmat_2x2f_neon(ne10_float32_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count) asm("ne10_detmat_2x2f_neon")
Specific implementation of ne10_detmat_2x2f using NEON intrinsics.
ne10_result_t ne10_div_float_neon(ne10_float32_t *dst, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_div_float_neon")
Specific implementation of ne10_div_float using NEON intrinsics.
ne10_result_t ne10_add_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src1, ne10_vec4f_t *src2, ne10_uint32_t count) asm("ne10_add_vec4f_neon")
Specific implementation of ne10_add_vec4f using NEON intrinsics.
ne10_result_t ne10_invmat_3x3f_c(ne10_mat3x3f_t *dst, ne10_mat3x3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_3x3f using plain C code.
ne10_result_t ne10_setc_float_neon(ne10_float32_t *dst, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_setc_float using NEON intrinsics.
ne10_result_t ne10_addmat_2x2f_neon(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src1, ne10_mat2x2f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_2x2f using NEON intrinsics.
ne10_result_t ne10_normalize_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_normalize_vec2f using plain C code.
ne10_result_t ne10_mulc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec2f using NEON intrinsics.
ne10_result_t ne10_divc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec4f using plain C code.
ne10_result_t ne10_subc_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src, const ne10_vec3f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec3f using plain C code.
ne10_result_t ne10_rsbc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_rsbc_vec4f using NEON intrinsics.
ne10_result_t ne10_addmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_addmat_4x4f using plain C code.
ne10_result_t ne10_divc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec2f using NEON intrinsics.
ne10_result_t ne10_subc_vec4f_c(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec4f using plain C code.
ne10_result_t ne10_mulc_vec4f_neon(ne10_vec4f_t *dst, ne10_vec4f_t *src, const ne10_vec4f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_mulc_vec4f using NEON intrinsics.
ne10_func_4args_t ftbl_4args[MAX_FUNC_COUNT]
ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_c(ne10_vec3f_t *dst, const ne10_mat3x3f_t *cst, ne10_vec3f_t *src, ne10_uint32_t count)
Specific implementation of ne10_mulcmatvec_cm3x3f_v3f using plain C code.
#define MAX_VEC_COMPONENTS 4
ne10_result_t ne10_mulmat_4x4f_c(ne10_mat4x4f_t *dst, ne10_mat4x4f_t *src1, ne10_mat4x4f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mulmat_4x4f using plain C code.
ne10_result_t ne10_subc_vec2f_neon(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_subc_vec2f using NEON intrinsics.
ne10_result_t ne10_mlac_float_c(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_mlac_float using plain C code.
ne10_result_t ne10_mla_float_c(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count)
Specific implementation of ne10_mla_float using plain C code.
ne10_result_t ne10_divc_vec2f_c(ne10_vec2f_t *dst, ne10_vec2f_t *src, const ne10_vec2f_t *cst, ne10_uint32_t count)
Specific implementation of ne10_divc_vec2f using plain C code.
ne10_result_t ne10_invmat_2x2f_c(ne10_mat2x2f_t *dst, ne10_mat2x2f_t *src, ne10_uint32_t count)
Specific implementation of ne10_invmat_2x2f using plain C code.
ne10_result_t ne10_mla_float_neon(ne10_float32_t *dst, ne10_float32_t *acc, ne10_float32_t *src1, ne10_float32_t *src2, ne10_uint32_t count) asm("ne10_mla_float_neon")
Specific implementation of ne10_mla_float using NEON intrinsics.
ne10_result_t ne10_sub_vec3f_c(ne10_vec3f_t *dst, ne10_vec3f_t *src1, ne10_vec3f_t *src2, ne10_uint32_t count)
Specific implementation of ne10_sub_vec3f using plain C code.
ne10_result_t ne10_setc_float_c(ne10_float32_t *dst, const ne10_float32_t cst, ne10_uint32_t count)
Specific implementation of ne10_setc_float using plain C code.