Project Ne10
An open, optimized software library for the ARM architecture.
macros.h
Go to the documentation of this file.
1 /*
2  * Copyright 2011-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the <organization> nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM Limited and Contributors. BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : common/macros.h
30  */
31 
32 #include "factor.h"
33 
34 /*
35  * The following macros are used within some of the NEON math function implementations.
36  *
37  * With the exception of the VEC4F macros, which can always wholly fill NEON SIMD vectors,
38  * each macro takes two parameters -- loopCode1 for the code to be run within the main
39  * SIMD loop, and loopCode2 for the code to be run in processing any leftover elements.
40  * The details of the variables that are exposed within these macros can be viewed in the
41  * specific MAINLOOP and SECONDLOOP sub-macros that each macro utilizes.
42  */
43 
/*
 * NE10_DstSrcCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that:
 *   1. declares n_cst, a float32x4_t whose four lanes all hold `cst`;
 *   2. runs NE10_CHECKPOINTER_DstSrcCst;
 *   3. invokes NE10_DstSrcCst_OPERATION_FLOAT_NEON with two arguments: the
 *      MAINLOOP sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro
 *      wrapping loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * `cst` is not a macro parameter: it is picked up from the scope in which
 * the macro is expanded. The helper macros used here are not defined in this
 * file (presumably they come from factor.h -- confirm).
 *
 * The ';' that follows each sub-macro invocation sits inside the argument
 * list, so it is handed to the OPERATION macro as part of that argument.
 * The loopCode arguments are pasted as raw tokens; a comma that is not
 * protected by parentheses would be read as an argument separator.
 *
 * n_cst is built with vdupq_n_f32() instead of a brace initializer because
 * brace-initializing a NEON vector type is a compiler extension rather than
 * portable ACLE. The declaration is placed before the pointer check so that
 * declarations precede statements, as in the other FLOAT macros of this file.
 */
#define NE10_DstSrcCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_cst = vdupq_n_f32(cst); \
    NE10_CHECKPOINTER_DstSrcCst; \
    NE10_DstSrcCst_OPERATION_FLOAT_NEON( \
        NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); \
        , \
        NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
52 
/*
 * NE10_DstSrcCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that first runs
 * NE10_CHECKPOINTER_DstSrcCst and then invokes
 * NE10_DstSrcCst_OPERATION_VEC2F_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstSrcCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2) \
{ \
    NE10_CHECKPOINTER_DstSrcCst; \
    NE10_DstSrcCst_OPERATION_VEC2F_NEON( \
        NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); \
        , \
        NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
60 
/*
 * NE10_DstSrcCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2)
 *
 * This macro uses interleaving to boost the performance (the interleaving
 * itself is not visible here; presumably it lives in the helper macros).
 *
 * Expands to a brace-enclosed block that first runs
 * NE10_CHECKPOINTER_DstSrcCst and then invokes
 * NE10_DstSrcCst_OPERATION_VEC3F_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The ';' after each sub-macro invocation is inside the argument list and is
 * therefore passed along as part of that argument.
 */
#define NE10_DstSrcCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2) \
{ \
    NE10_CHECKPOINTER_DstSrcCst; \
    NE10_DstSrcCst_OPERATION_VEC3F_NEON( \
        NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); \
        , \
        NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
69 
/*
 * NE10_DstSrcCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode)
 *
 * Expands to a brace-enclosed block that first runs
 * NE10_CHECKPOINTER_DstSrcCst and then invokes
 * NE10_DstSrcCst_OPERATION_VEC4F_NEON with a single argument: the MAINLOOP
 * sub-macro wrapping loopCode.
 *
 *   loopCode - code to run within the SIMD loop.
 *
 * There is no second (leftover) code argument: per this file's overview
 * comment, the VEC4F macros can always wholly fill NEON SIMD vectors.
 * The ';' after the sub-macro invocation is inside the argument list and is
 * therefore passed along as part of the argument.
 */
#define NE10_DstSrcCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode) \
{ \
    NE10_CHECKPOINTER_DstSrcCst; \
    NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
        NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
76 
/*
 * NE10_DstAccSrcCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that:
 *   1. declares n_acc, a float32x4_t left uninitialised here (whatever
 *      assigns it is not visible in this file);
 *   2. declares n_cst, a float32x4_t whose four lanes all hold `cst`;
 *   3. runs NE10_CHECKPOINTER_DstAccSrcCst;
 *   4. invokes NE10_DstAccSrcCst_OPERATION_FLOAT_NEON with two arguments:
 *      the MAINLOOP sub-macro wrapping loopCode1 and the SECONDLOOP
 *      sub-macro wrapping loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * `cst` is not a macro parameter: it is picked up from the scope in which
 * the macro is expanded. The helper macros used here are not defined in this
 * file (presumably they come from factor.h -- confirm).
 *
 * The ';' after each sub-macro invocation is inside the argument list and is
 * therefore passed along as part of that argument.
 *
 * n_cst is built with vdupq_n_f32() instead of a brace initializer because
 * brace-initializing a NEON vector type is a compiler extension rather than
 * portable ACLE.
 */
#define NE10_DstAccSrcCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_acc; \
    float32x4_t n_cst = vdupq_n_f32(cst); \
    NE10_CHECKPOINTER_DstAccSrcCst; \
    NE10_DstAccSrcCst_OPERATION_FLOAT_NEON( \
        NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); \
        , \
        NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
86 
/*
 * NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that declares n_acc (a float32x4_t left
 * uninitialised here), runs NE10_CHECKPOINTER_DstAccSrcCst, and then invokes
 * NE10_DstAccSrcCst_OPERATION_VEC2F_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_acc; \
    NE10_CHECKPOINTER_DstAccSrcCst; \
    NE10_DstAccSrcCst_OPERATION_VEC2F_NEON( \
        NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); \
        , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
95 
/*
 * NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that declares three float32x4_t
 * variables (n_acc1, n_acc2, n_acc3; all left uninitialised here), runs
 * NE10_CHECKPOINTER_DstAccSrcCst, and then invokes
 * NE10_DstAccSrcCst_OPERATION_VEC3F_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_acc1; \
    float32x4_t n_acc2; \
    float32x4_t n_acc3; \
    NE10_CHECKPOINTER_DstAccSrcCst; \
    NE10_DstAccSrcCst_OPERATION_VEC3F_NEON( \
        NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); \
        , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
104 
/*
 * NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode)
 *
 * Expands to a brace-enclosed block that declares n_acc (a float32x4_t left
 * uninitialised here), runs NE10_CHECKPOINTER_DstAccSrcCst, and then invokes
 * NE10_DstAccSrcCst_OPERATION_VEC4F_NEON with a single argument: the
 * MAINLOOP sub-macro wrapping loopCode.
 *
 *   loopCode - code to run within the SIMD loop.
 *
 * There is no second (leftover) code argument: per this file's overview
 * comment, the VEC4F macros can always wholly fill NEON SIMD vectors.
 * The ';' after the sub-macro invocation is inside the argument list and is
 * therefore passed along as part of the argument.
 */
#define NE10_DstAccSrcCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode) \
{ \
    float32x4_t n_acc; \
    NE10_CHECKPOINTER_DstAccSrcCst; \
    NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
        NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
112 
/*
 * NE10_DstCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that:
 *   1. declares n_cst, a float32x4_t whose four lanes all hold `cst`;
 *   2. runs NE10_CHECKPOINTER_DstCst;
 *   3. invokes NE10_DstCst_OPERATION_FLOAT_NEON with two arguments: the
 *      MAINLOOP sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro
 *      wrapping loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * `cst` is not a macro parameter: it is picked up from the scope in which
 * the macro is expanded. The helper macros used here are not defined in this
 * file (presumably they come from factor.h -- confirm).
 *
 * The ';' after each sub-macro invocation is inside the argument list and is
 * therefore passed along as part of that argument.
 *
 * n_cst is built with vdupq_n_f32() instead of a brace initializer because
 * brace-initializing a NEON vector type is a compiler extension rather than
 * portable ACLE.
 */
#define NE10_DstCst_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_cst = vdupq_n_f32(cst); \
    NE10_CHECKPOINTER_DstCst; \
    NE10_DstCst_OPERATION_FLOAT_NEON( \
        NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); \
        , \
        NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
121 
/*
 * NE10_DstCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that first runs NE10_CHECKPOINTER_DstCst
 * and then invokes NE10_DstCst_OPERATION_VEC2F_NEON with two arguments: the
 * MAINLOOP sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro
 * wrapping loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstCst_DO_COUNT_TIMES_VEC2F_NEON(loopCode1, loopCode2) \
{ \
    NE10_CHECKPOINTER_DstCst; \
    NE10_DstCst_OPERATION_VEC2F_NEON( \
        NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); \
        , \
        NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
129 
/*
 * NE10_DstCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2)
 *
 * This macro uses interleaving to boost the performance (the interleaving
 * itself is not visible here; presumably it lives in the helper macros).
 *
 * Expands to a brace-enclosed block that first runs NE10_CHECKPOINTER_DstCst
 * and then invokes NE10_DstCst_OPERATION_VEC3F_NEON with two arguments: the
 * MAINLOOP sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro
 * wrapping loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The ';' after each sub-macro invocation is inside the argument list and is
 * therefore passed along as part of that argument.
 */
#define NE10_DstCst_DO_COUNT_TIMES_VEC3F_NEON(loopCode1, loopCode2) \
{ \
    NE10_CHECKPOINTER_DstCst; \
    NE10_DstCst_OPERATION_VEC3F_NEON( \
        NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); \
        , \
        NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
138 
/*
 * NE10_DstCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode)
 *
 * Expands to a brace-enclosed block that first runs NE10_CHECKPOINTER_DstCst
 * and then invokes NE10_DstCst_OPERATION_VEC4F_NEON with a single argument:
 * the MAINLOOP sub-macro wrapping loopCode.
 *
 *   loopCode - code to run within the SIMD loop.
 *
 * There is no second (leftover) code argument: per this file's overview
 * comment, the VEC4F macros can always wholly fill NEON SIMD vectors.
 * The ';' after the sub-macro invocation is inside the argument list and is
 * therefore passed along as part of the argument.
 */
#define NE10_DstCst_DO_COUNT_TIMES_VEC4F_NEON(loopCode) \
{ \
    NE10_CHECKPOINTER_DstCst; \
    NE10_DstCst_OPERATION_VEC4F_NEON( \
        NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
145 
/*
 * NE10_DstSrc1Src2_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that declares n_src2 (a float32x4_t left
 * uninitialised here), runs NE10_CHECKPOINTER_DstSrc1Src2, and then invokes
 * NE10_DstSrc1Src2_OPERATION_FLOAT_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstSrc1Src2_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_src2; \
    NE10_CHECKPOINTER_DstSrc1Src2; \
    NE10_DstSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); \
        , \
        NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
154 
/*
 * NE10_DstAccSrc1Src2_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a brace-enclosed block that declares n_acc and n_src2 (both
 * float32x4_t, left uninitialised here), runs
 * NE10_CHECKPOINTER_DstAccSrc1Src2, and then invokes
 * NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON with two arguments: the MAINLOOP
 * sub-macro wrapping loopCode1 and the SECONDLOOP sub-macro wrapping
 * loopCode2.
 *
 *   loopCode1 - code to run within the main SIMD loop.
 *   loopCode2 - code to run when processing any leftover elements.
 *
 * The helper macros are not defined in this file. The ';' after each
 * sub-macro invocation is inside the argument list and is therefore passed
 * along as part of that argument.
 */
#define NE10_DstAccSrc1Src2_DO_COUNT_TIMES_FLOAT_NEON(loopCode1, loopCode2) \
{ \
    float32x4_t n_acc; \
    float32x4_t n_src2; \
    NE10_CHECKPOINTER_DstAccSrc1Src2; \
    NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); \
        , \
        NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}