Project Ne10
An open, optimized software library for the ARM architecture.
Main Page
Related Pages
Modules
Classes
Files
Examples
File List
File Members
common
factor.h
Go to the documentation of this file.
1
/*
2
* Copyright 2011-16 ARM Limited and Contributors.
3
* All rights reserved.
4
*
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the name of ARM Limited nor the
13
* names of its contributors may be used to endorse or promote products
14
* derived from this software without specific prior written permission.
15
*
16
* THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
/*
29
* NE10 Library : common/factor.h
30
*/
31
32
// Typebuilding macros (slight difference between toolchain versions on intrinsics)
33
/*
 * Expands to a doubly-braced initializer holding three {a, b} pairs:
 * {{ {x1, y1}, {x2, y2}, {x3, y3} }}.  It is used in this file to
 * initialize float32x2x3_t temporaries.
 */
#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
    {{ \
        {(x1), (y1)}, {(x2), (y2)}, {(x3), (y3)} \
    }}
37
38
// There are several categories of functions that share common code. Different groups of
// functions take different numbers of inputs.
//
// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
//
// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
47
48
// A few macros to check pointers and their address range to make sure there's
49
// no unwanted overlap between any two of them
50
/*
 * Asserts that the ranges starting at dst and src do not partially overlap.
 * Reads `dst`, `src` and `count` from the expansion site.  Equal pointers
 * (in-place operation) pass without any check.
 *
 * The offset is applied to a char pointer, so `count` is measured in bytes
 * here, exactly as the former (GNU-only) void-pointer arithmetic did.
 * NOTE(review): if `count` is an element count at the call sites, this
 * guards fewer bytes than the kernels touch -- confirm before tightening.
 */
#define NE10_CHECKPOINTER_DstSrcCst \
    if ( (char *) dst < (char *) src ) \
    { assert ( (char *) dst + count <= (char *) src ); } \
    else if ( (char *) dst > (char *) src ) \
    { assert ( (char *) src + count <= (char *) dst ); }
55
56
/* Pointer check for kernels that take only dst and src: expands to the same
   check as NE10_CHECKPOINTER_DstSrcCst. */
#define NE10_CHECKPOINTER_DstSrc NE10_CHECKPOINTER_DstSrcCst
57
58
/*
 * Pairwise overlap check for three pointers.  The pairs are tested in the
 * order (arg1, arg2), (arg1, arg3), (arg3, arg2); for each pair the lower
 * address plus `count` must not pass the higher one.  Equal pointers are
 * accepted.  `count` is taken from the expansion site and is applied as a
 * byte offset (char pointer arithmetic).
 * NOTE(review): if `count` is an element count at the call sites, the
 * guarded span is shorter than the data actually processed -- confirm.
 *
 * Expands to a sequence of if-statements, not a single statement.
 */
#define NE10_CHECKPOINTER_3POINTER(arg1, arg2, arg3) \
    if ( (char *) (arg1) < (char *) (arg2) ) \
    { assert ( (char *) (arg1) + count <= (char *) (arg2) ); } \
    else if ( (char *) (arg1) > (char *) (arg2) ) \
    { assert ( (char *) (arg2) + count <= (char *) (arg1) ); } \
    if ( (char *) (arg1) < (char *) (arg3) ) \
    { assert ( (char *) (arg1) + count <= (char *) (arg3) ); } \
    else if ( (char *) (arg1) > (char *) (arg3) ) \
    { assert ( (char *) (arg3) + count <= (char *) (arg1) ); } \
    if ( (char *) (arg3) < (char *) (arg2) ) \
    { assert ( (char *) (arg3) + count <= (char *) (arg2) ); } \
    else if ( (char *) (arg3) > (char *) (arg2) ) \
    { assert ( (char *) (arg2) + count <= (char *) (arg3) ); }
71
72
/*
 * Pairwise overlap check for four pointers.  Delegates the three pairs among
 * arg1..arg3 to NE10_CHECKPOINTER_3POINTER, then tests arg4 against each of
 * the others in the order (arg1, arg4), (arg2, arg4), (arg4, arg3).
 * `count` comes from the expansion site and is applied as a byte offset.
 * NOTE(review): if `count` is an element count at the call sites, the
 * guarded span is shorter than the data actually processed -- confirm.
 *
 * Expands to a sequence of if-statements, not a single statement.
 */
#define NE10_CHECKPOINTER_4POINTER(arg1, arg2, arg3, arg4) \
    NE10_CHECKPOINTER_3POINTER(arg1, arg2, arg3) \
    if ( (char *) (arg1) < (char *) (arg4) ) \
    { assert ( (char *) (arg1) + count <= (char *) (arg4) ); } \
    else if ( (char *) (arg1) > (char *) (arg4) ) \
    { assert ( (char *) (arg4) + count <= (char *) (arg1) ); } \
    if ( (char *) (arg2) < (char *) (arg4) ) \
    { assert ( (char *) (arg2) + count <= (char *) (arg4) ); } \
    else if ( (char *) (arg2) > (char *) (arg4) ) \
    { assert ( (char *) (arg4) + count <= (char *) (arg2) ); } \
    if ( (char *) (arg4) < (char *) (arg3) ) \
    { assert ( (char *) (arg4) + count <= (char *) (arg3) ); } \
    else if ( (char *) (arg4) > (char *) (arg3) ) \
    { assert ( (char *) (arg3) + count <= (char *) (arg4) ); }
86
87
88
89
/* Overlap check for kernels taking dst, acc and src: a braced block that
   applies NE10_CHECKPOINTER_3POINTER to those three names, which are taken
   from the expansion site. */
#define NE10_CHECKPOINTER_DstAccSrcCst { \
    NE10_CHECKPOINTER_3POINTER(dst, acc, src); \
}
91
92
/* Kernels that take only dst and cst have a single pointer, so there is no
   pair to compare: the check expands to an empty block. */
#define NE10_CHECKPOINTER_DstCst {}
93
94
/* Overlap check for kernels taking dst, src1 and src2: a braced block that
   applies NE10_CHECKPOINTER_3POINTER to those three names, which are taken
   from the expansion site. */
#define NE10_CHECKPOINTER_DstSrc1Src2 { \
    NE10_CHECKPOINTER_3POINTER(dst, src1, src2); \
}
96
97
/* Overlap check for kernels taking dst, acc, src1 and src2: a braced block
   that applies NE10_CHECKPOINTER_4POINTER to those four names, which are
   taken from the expansion site. */
#define NE10_CHECKPOINTER_DstAccSrc1Src2 { \
    NE10_CHECKPOINTER_4POINTER(dst, acc, src1, src2); \
}
99
100
101
// Main Loop = The loop where the number of items to be processed is exactly the
102
// number that we can process in a single iteration.
103
//
104
// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
105
// did not fit into the Main Loop. This is needed when the number of
106
// input items is not a multiple of the number of items that we
107
// process in every iteration of the Main Loop.
108
109
110
/****************************************************
111
* *
112
* The "DstSrcCst" group of functions *
113
* *
114
****************************************************/
115
117
118
/*
 * Main-loop body for the scalar-float "DstSrcCst" kernels: one pass handles
 * four float values.  Loads four floats from src into n_src, steps src by 4,
 * runs loopCode, then stores n_dst to dst and steps dst by 4.
 * n_src and n_dst are not declared here and must be in scope where the
 * macro expands; loopCode is expected to assign n_dst.
 */
#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src );  /* load 4 values */ \
    src += 4;                               /* move to the next 4 items */ \
    loopCode;                               /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );   /* store the results back */ \
    dst += 4;                               /* move to the next 4 items */ \
}
126
127
/*
 * Tail-loop body for the scalar-float "DstSrcCst" kernels: handles a single
 * float, for counts that are not a multiple of 4.
 * Declares n_rest (lane 0 loaded from *src, lane 1 left at 0.0f) and
 * n_rest_cst ({cst, cst}) for loopCode to use, stores lane 0 of n_rest to
 * *dst afterwards, and advances src and dst by one element.
 */
#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest = { 0.0f , 0.0f };     /* scratch vector for the lane load/store */ \
    float32x2_t n_rest_cst = { cst, cst };    /* the constant, replicated in both lanes */ \
    n_rest = vld1_lane_f32 ( (float32_t*)src, n_rest, 0);  /* load into lane 0 */ \
    loopCode;                                 /* the actual operation is placed here */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0);           /* store lane 0 back to memory */ \
    src++;                                    /* move to the next item in the stream */ \
    dst++; \
}
137
138
/*
 * Driver for the scalar-float kernels.  Expands to a complete braced body
 * that ends in `return res;` (res is always NE10_OK here).
 *
 * loopCode1 runs once per group of four items while count > count % 4, with
 * count decremented by 4 each pass; loopCode2 then runs once for each of the
 * count % 4 leftover items.  Declares n_src and n_dst for the loop bodies;
 * `count` is read and modified at the expansion site.
 */
#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    int dif = count % 4;   /* 0, or 1..3 leftovers handled by the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < (unsigned int) dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
155
157
158
/*
 * Main-loop body for the 2D-vector "DstSrcCst" kernels: one pass handles two
 * 2D vectors (four floats).  Loads four floats from src into n_src, steps
 * src by 2, runs loopCode, stores n_dst to dst and steps dst by 2.
 * n_src and n_dst must be in scope where the macro expands.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src );  /* load two vectors */ \
    src += 2;                               /* move to the next two vectors */ \
    loopCode;                               /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );   /* store back */ \
    dst += 2;                               /* move to the next two vectors */ \
}
165
166
/*
 * Tail body for the 2D-vector "DstSrcCst" kernels, for an odd count.
 * Declares n_rest (two floats loaded from src) and n_rest_cst
 * ({cst->x, cst->y}) for loopCode, then stores n_rest to dst.
 * src and dst are not advanced.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest; \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    n_rest = vld1_f32 ( (float32_t*)src );   /* the one remaining 2D vector */ \
    loopCode; \
    vst1_f32 ( (float32_t*)dst, n_rest); \
}
173
174
/*
 * Driver for the 2D-vector kernels.  Expands to a complete braced body that
 * ends in `return res;` (res is always NE10_OK here).
 *
 * Declares n_cst = {x, y, x, y} from cst, plus n_src and n_dst.  loopCode1
 * runs while count > count % 2, with count decremented by 2 each pass;
 * loopCode2 runs once if count was odd.
 */
#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    int dif = count % 2;   /* 1 when a single vector is left over */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        loopCode2; \
    } \
    return res; \
}
188
190
191
/*
 * Main-loop body for the 3D-vector "DstSrcCst" kernels.  One pass moves
 * three quad registers, i.e. twelve floats = four 3D vectors.
 * Loads n_src1..n_src3 from src, runs loopCode, stores n_dst1..n_dst3 to
 * dst.  Both pointers are advanced by 4 * sizeof(ne10_float32_t) bytes after
 * each register; the byte step is taken on a char pointer so no void-pointer
 * arithmetic is needed.  The n_src and n_dst registers must be in scope where
 * the macro expands.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src1 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    n_src2 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    n_src3 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    loopCode;   /* the actual operation on all three registers */ \
    vst1q_f32 ( (float32_t*)dst, n_dst1 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst2 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst3 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
}
206
207
/*
 * Tail-loop body for the 3D-vector "DstSrcCst" kernels: handles one 3D
 * vector, for counts that are not a multiple of 4.
 * Declares n_rest (zeroed, then lane 0 of its three registers loaded from
 * src) and n_rest_cst (cst->x, cst->y, cst->z in lane 0, 0 in lane 1) for
 * loopCode, stores lane 0 of n_rest to dst, and advances src and dst by one
 * element.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest = FLOAT32_2x3( \
        0.0f, 0.0f, 0.0f , 0.0f, 0.0f , 0.0f); \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    n_rest = vld3_lane_f32 ( (float32_t*)src, n_rest, 0); \
    loopCode;   /* the actual operation on the single leftover vector */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest, 0); \
    src++; \
    dst++; \
}
218
219
/*
 * Driver for the 3D-vector kernels.  Expands to a complete braced body that
 * ends in `return res;` (res is always NE10_OK here).
 *
 * n_cst1..n_cst3 hold the constant repeated as x,y,z,x | y,z,x,y | z,x,y,z,
 * i.e. twelve consecutive floats.  loopCode1 runs while count > count % 4,
 * with count decremented by 4 each pass; loopCode2 then runs once for each
 * of the count % 4 leftover vectors.
 */
#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    float32x4_t n_src1, n_src2, n_src3; \
    float32x4_t n_dst1, n_dst2, n_dst3; \
    int dif = count % 4;   /* leftover vectors for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < (unsigned int) dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
238
240
241
/* Note that for the VEC4* types, we do not need a second loop: every input item is
   exactly four floats, so each one fills a whole 128-bit NEON register. */
243
244
/*
 * Loop body for the 4D-vector "DstSrcCst" kernels: one pass handles one 4D
 * vector.  Loads four floats from src into n_src, steps src by one element,
 * runs loopCode, stores n_dst to dst and steps dst by one element.
 * n_src and n_dst must be in scope where the macro expands.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src ); \
    src ++; \
    loopCode;   /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst ); \
    dst ++; \
}
251
252
/*
 * Driver for the 4D-vector kernels.  Expands to a complete braced body that
 * ends in `return res;` (res is always NE10_OK here).
 *
 * Declares n_cst = {x, y, z, w} from cst, plus n_src and n_dst, and runs
 * loopCode once per item, counting `count` down to zero.
 */
#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    for (; count != 0; count --) { \
        loopCode; \
    } \
    return res; \
}
262
263
/****************************************************
264
* *
265
* The "DstAccSrcCst" group of functions *
266
* *
267
****************************************************/
268
270
271
/*
 * Main-loop body for the scalar-float "DstAccSrcCst" kernels: one pass
 * handles four float values.  Loads four floats each from acc and src into
 * n_acc and n_src, steps both pointers by 4, runs loopCode, then stores
 * n_dst to dst and steps dst by 4.
 * n_acc, n_src and n_dst must be in scope where the macro expands.
 */
#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc );  /* load 4 accumulator values */ \
    n_src = vld1q_f32 ( (float32_t*)src );  /* load 4 source values */ \
    acc += 4;                               /* move to the next 4 items */ \
    src += 4; \
    loopCode;                               /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );   /* store the results back */ \
    dst += 4;                               /* move to the next 4 items */ \
}
281
282
/*
 * Tail-loop body for the scalar-float "DstAccSrcCst" kernels: handles a
 * single float, for counts that are not a multiple of 4.
 * Declares n_rest_acc (lane 0 from *acc), n_rest (lane 0 from *src) and
 * n_rest_cst ({cst, cst}) for loopCode; lane 1 of the loaded vectors stays
 * 0.0f.  Lane 0 of n_rest is stored to *dst, then acc, src and dst advance
 * by one element.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_acc = { 0.0f , 0.0f };  /* scratch vector for the accumulator */ \
    float32x2_t n_rest = { 0.0f , 0.0f };      /* scratch vector for the source / result */ \
    float32x2_t n_rest_cst = { cst, cst };     /* the constant, replicated in both lanes */ \
    n_rest_acc = vld1_lane_f32 ( (float32_t*)acc, n_rest_acc, 0);  /* load into lane 0 */ \
    n_rest = vld1_lane_f32 ( (float32_t*)src, n_rest, 0);          /* load into lane 0 */ \
    loopCode;                                  /* the actual operation is placed here */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0);                   /* store lane 0 back to memory */ \
    acc++;                                     /* move to the next item in the stream */ \
    src++; \
    dst++; \
}
295
296
/* The accumulating scalar-float kernels reuse the DstSrcCst driver unchanged.
   NOTE(review): that driver declares n_src and n_dst but not n_acc --
   presumably the caller declares it; confirm at the call sites. */
#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
297
299
300
/*
 * Main-loop body for the 2D-vector "DstAccSrcCst" kernels: one pass handles
 * two 2D vectors (four floats).  Loads four floats each from acc and src
 * into n_acc and n_src, steps both pointers by 2, runs loopCode, stores
 * n_dst to dst and steps dst by 2.
 * n_acc, n_src and n_dst must be in scope where the macro expands.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc );  /* load two accumulator vectors */ \
    n_src = vld1q_f32 ( (float32_t*)src );  /* load two source vectors */ \
    acc += 2;                               /* move to the next two vectors */ \
    src += 2; \
    loopCode;                               /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );   /* store back */ \
    dst += 2;                               /* move to the next two vectors */ \
}
309
310
/*
 * Tail body for the 2D-vector "DstAccSrcCst" kernels, for an odd count.
 * Declares n_rest_acc (two floats from acc), n_rest (two floats from src)
 * and n_rest_cst ({cst->x, cst->y}) for loopCode, then stores n_rest to dst.
 * No pointer is advanced.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest_acc; \
    float32x2_t n_rest; \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    n_rest_acc = vld1_f32 ( (float32_t*)acc ); \
    n_rest = vld1_f32 ( (float32_t*)src ); \
    loopCode;   /* the actual operation on the single leftover vector */ \
    vst1_f32 ( (float32_t*)dst, n_rest); \
}
319
320
/* The accumulating 2D-vector kernels reuse the DstSrcCst driver unchanged.
   NOTE(review): that driver declares n_cst, n_src and n_dst but not n_acc --
   presumably the caller declares it; confirm at the call sites. */
#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
321
323
324
/*
 * Main-loop body for the 3D-vector "DstAccSrcCst" kernels.  One pass moves
 * three quad registers per stream, i.e. twelve floats = four 3D vectors.
 * Loads n_acc1..n_acc3 from acc and n_src1..n_src3 from src, runs loopCode,
 * then stores n_dst1..n_dst3 to dst.  Each pointer is advanced by
 * 4 * sizeof(ne10_float32_t) bytes after every register; the byte step is
 * taken on a char pointer so no void-pointer arithmetic is needed.
 * All n_acc, n_src and n_dst registers must be in scope at the expansion.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    /* load accumulator values */ \
    n_acc1 = vld1q_f32 ( (float32_t*)acc ); \
    acc = (void *) ( (char *) acc + 4 * sizeof (ne10_float32_t) ); \
    n_acc2 = vld1q_f32 ( (float32_t*)acc ); \
    acc = (void *) ( (char *) acc + 4 * sizeof (ne10_float32_t) ); \
    n_acc3 = vld1q_f32 ( (float32_t*)acc ); \
    acc = (void *) ( (char *) acc + 4 * sizeof (ne10_float32_t) ); \
    /* load source values */ \
    n_src1 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    n_src2 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    n_src3 = vld1q_f32 ( (float32_t*)src ); \
    src = (void *) ( (char *) src + 4 * sizeof (ne10_float32_t) ); \
    loopCode;   /* the actual operation on all registers */ \
    /* store the results back into memory */ \
    vst1q_f32 ( (float32_t*)dst, n_dst1 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst2 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_dst3 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
}
345
346
/*
 * Tail-loop body for the 3D-vector "DstAccSrcCst" kernels: handles one 3D
 * vector, for counts that are not a multiple of 4.
 * Declares n_rest_acc and n_rest (zeroed, then lane 0 of their three
 * registers loaded from acc and src) and n_rest_cst (cst->x, cst->y, cst->z
 * in lane 0, 0 in lane 1) for loopCode.  Lane 0 of n_rest is stored to dst,
 * then acc, src and dst advance by one element.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest_acc = FLOAT32_2x3( \
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); \
    float32x2x3_t n_rest = FLOAT32_2x3( \
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, \
        (const float32x2_t){cst->z, 0} }; \
    n_rest_acc = vld3_lane_f32 ( (float32_t*)acc, n_rest_acc, 0); \
    n_rest = vld3_lane_f32 ( (float32_t*)src, n_rest, 0); \
    loopCode;   /* the actual operation on the single leftover vector */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest, 0); \
    acc++; \
    src++; \
    dst++; \
}
368
369
/* The accumulating 3D-vector kernels reuse the DstSrcCst driver unchanged.
   NOTE(review): that driver declares the n_cst, n_src and n_dst registers
   but no n_acc ones -- presumably the caller declares them; confirm. */
#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
370
372
373
/*
 * Loop body for the 4D-vector "DstAccSrcCst" kernels: one pass handles one
 * 4D vector.  Loads four floats each from acc and src into n_acc and n_src,
 * steps both pointers by one element, runs loopCode, stores n_dst to dst and
 * steps dst by one element.
 * n_acc, n_src and n_dst must be in scope where the macro expands.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc ); \
    n_src = vld1q_f32 ( (float32_t*)src ); \
    acc ++; \
    src ++; \
    loopCode;   /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst ); \
    dst ++; \
}
382
383
/* The accumulating 4D-vector kernels reuse the DstSrcCst driver unchanged.
   NOTE(review): that driver declares n_cst, n_src and n_dst but not n_acc --
   presumably the caller declares it; confirm at the call sites. */
#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
384
385
/****************************************************
386
* *
387
* The "DstCst" group of functions *
388
* *
389
****************************************************/
390
392
393
/*
 * Main-loop body for the scalar-float "DstCst" kernels: one pass writes four
 * float values.  Nothing is loaded; loopCode runs, then n_cst is stored to
 * dst and dst is stepped by 4.
 * n_cst is not declared here and must be in scope where the macro expands.
 */
#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    loopCode;                               /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_cst );   /* store the results back */ \
    dst += 4;                               /* move to the next 4 items */ \
}
399
400
/*
 * Tail-loop body for the scalar-float "DstCst" kernels: writes a single
 * float, for counts that are not a multiple of 4.
 * Declares n_rest_cst ({cst, cst}) for loopCode, stores its lane 0 to *dst
 * and advances dst by one element.
 */
#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_cst = { cst, cst };    /* the constant, replicated in both lanes */ \
    loopCode;                                 /* the actual operation is placed here */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest_cst, 0);   /* store lane 0 back to memory */ \
    dst++;                                    /* move to the next item in the stream */ \
}
407
408
/*
 * Driver for the scalar-float "DstCst" kernels.  Expands to a complete
 * braced body that ends in `return res;` (res is always NE10_OK here).
 *
 * loopCode1 runs while count > count % 4, with count decremented by 4 each
 * pass; loopCode2 then runs once for each of the count % 4 leftover items.
 * No NEON registers are declared here.
 */
#define NE10_DstCst_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    int dif = count % 4;   /* 0, or 1..3 leftovers handled by the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < (unsigned int) dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
423
425
426
427
/*
 * Main-loop body for the 2D-vector "DstCst" kernels: one pass writes two 2D
 * vectors (four floats).  Runs loopCode, stores n_cst to dst and steps dst
 * by 2.  n_cst must be in scope where the macro expands.
 */
#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    loopCode;                               /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst, n_cst );   /* store back */ \
    dst += 2;                               /* move to the next two vectors */ \
}
432
433
/*
 * Tail body for the 2D-vector "DstCst" kernels, for an odd count.
 * Declares n_rest_cst ({cst->x, cst->y}) for loopCode, then stores it to
 * dst.  dst is not advanced.
 */
#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_rest_cst = { cst->x, cst->y }; \
    loopCode;   /* the actual operation on the single leftover vector */ \
    vst1_f32 ( (float32_t*)dst, n_rest_cst); \
}
438
439
/*
 * Driver for the 2D-vector "DstCst" kernels.  Expands to a complete braced
 * body that ends in `return res;` (res is always NE10_OK here).
 *
 * Declares n_cst = {x, y, x, y} from cst.  loopCode1 runs while
 * count > count % 2, with count decremented by 2 each pass; loopCode2 runs
 * once if count was odd.
 */
#define NE10_DstCst_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    int dif = count % 2;   /* 1 when a single vector is left over */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        loopCode2; \
    } \
    return res; \
}
451
453
454
/*
 * Main-loop body for the 3D-vector "DstCst" kernels.  One pass writes three
 * quad registers, i.e. twelve floats = four 3D vectors.
 * Runs loopCode, then stores n_cst1..n_cst3 to dst, advancing dst by
 * 4 * sizeof(ne10_float32_t) bytes after each store; the byte step is taken
 * on a char pointer so no void-pointer arithmetic is needed.
 * n_cst1..n_cst3 must be in scope where the macro expands.
 */
#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    loopCode;   /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_cst1 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_cst2 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
    vst1q_f32 ( (float32_t*)dst, n_cst3 ); \
    dst = (void *) ( (char *) dst + 4 * sizeof (ne10_float32_t) ); \
}
463
464
/*
 * Tail-loop body for the 3D-vector "DstCst" kernels: writes one 3D vector,
 * for counts that are not a multiple of 4.
 * Declares n_rest_cst (cst->x, cst->y, cst->z in lane 0, 0 in lane 1) for
 * loopCode, stores its lane 0 to dst and advances dst by one element.
 */
#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_rest_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    loopCode;   /* the actual operation on the single leftover vector */ \
    vst3_lane_f32 ( (float32_t*)dst, n_rest_cst, 0); \
    dst++; \
}
471
472
/*
 * Driver for the 3D-vector "DstCst" kernels.  Expands to a complete braced
 * body that ends in `return res;` (res is always NE10_OK here).
 *
 * n_cst1..n_cst3 hold the constant repeated as x,y,z,x | y,z,x,y | z,x,y,z,
 * i.e. twelve consecutive floats.  loopCode1 runs while count > count % 4,
 * with count decremented by 4 each pass; loopCode2 then runs once for each
 * of the count % 4 leftover vectors.
 */
#define NE10_DstCst_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    int dif = count % 4;   /* leftover vectors for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < (unsigned int) dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
489
491
492
/*
 * Loop body for the 4D-vector "DstCst" kernels: one pass writes one 4D
 * vector.  Runs loopCode, stores n_cst to dst and steps dst by one element.
 * n_cst must be in scope where the macro expands.
 */
#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    loopCode;   /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_cst ); \
    dst ++; \
}
497
498
/*
 * Driver for the 4D-vector "DstCst" kernels.  Expands to a complete braced
 * body that ends in `return res;` (res is always NE10_OK here).
 *
 * Declares n_cst = {x, y, z, w} from cst and runs loopCode once per item,
 * counting `count` down to zero.
 */
#define NE10_DstCst_OPERATION_VEC4F_NEON(loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    for (; count != 0; count --) { \
        loopCode; \
    } \
    return res; \
}
506
507
/****************************************************
508
* *
509
* The "DstSrc1Src2" group of functions *
510
* *
511
****************************************************/
512
514
515
/*
 * Main-loop body for the scalar-float "DstSrc1Src2" kernels: one pass
 * handles four float values.  Loads four floats from src1 into n_src and
 * four from src2 into n_src2, stepping each pointer by 4, runs loopCode,
 * then stores n_dst to dst and steps dst by 4.
 * n_src, n_src2 and n_dst must be in scope where the macro expands.
 */
#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ( (float32_t*)src1 );   /* load 4 values from the first source */ \
    src1 += 4;                                /* move to the next 4 items */ \
    n_src2 = vld1q_f32 ( (float32_t*)src2 );  /* load 4 values from the second source */ \
    src2 += 4;                                /* move to the next 4 items */ \
    loopCode;                                 /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );     /* store the results back */ \
    dst += 4;                                 /* move to the next 4 items */ \
}
525
526
/*
 * Tail-loop body for the scalar-float "DstSrc1Src2" kernels: handles a
 * single float, for counts that are not a multiple of 4.
 * Declares n_rest (lane 0 from *src1) and n_rest2 (lane 0 from *src2) for
 * loopCode; lane 1 of both stays 0.0f.  Lane 0 of n_rest is stored to *dst,
 * then src1, src2 and dst advance by one element.
 *
 * Each lane load is merged into its own zeroed vector; the second load used
 * to be merged into n_rest, which left n_rest2's initializer unused.
 */
#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest = { 0.0f , 0.0f };     /* scratch vector for src1 / the result */ \
    float32x2_t n_rest2 = { 0.0f , 0.0f };    /* scratch vector for src2 */ \
    n_rest = vld1_lane_f32 ( (float32_t*)src1, n_rest, 0);    /* load into lane 0 */ \
    n_rest2 = vld1_lane_f32 ( (float32_t*)src2, n_rest2, 0);  /* load into lane 0 */ \
    loopCode;                                 /* the actual operation is placed here */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0);              /* store lane 0 back to memory */ \
    src1++;                                   /* move to the next item in the stream */ \
    src2++; \
    dst++; \
}
538
539
/* The two-source scalar-float kernels reuse the DstSrcCst driver unchanged.
   NOTE(review): that driver declares n_src and n_dst but not n_src2 --
   presumably the caller declares it; confirm at the call sites. */
#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
540
541
/****************************************************
542
* *
543
* The "DstAccSrc1Src2" group of functions *
544
* *
545
****************************************************/
546
548
549
/*
 * Main-loop body for the scalar-float "DstAccSrc1Src2" kernels: one pass
 * handles four float values.  Loads four floats each from acc, src1 and
 * src2 into n_acc, n_src and n_src2, steps the three pointers by 4, runs
 * loopCode, then stores n_dst to dst and steps dst by 4.
 * n_acc, n_src, n_src2 and n_dst must be in scope where the macro expands.
 */
#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ( (float32_t*)acc );    /* load 4 accumulator values */ \
    n_src = vld1q_f32 ( (float32_t*)src1 );   /* load 4 values from the first source */ \
    n_src2 = vld1q_f32 ( (float32_t*)src2 );  /* load 4 values from the second source */ \
    acc += 4;                                 /* move to the next 4 items */ \
    src1 += 4; \
    src2 += 4; \
    loopCode;                                 /* the actual operation is placed here */ \
    vst1q_f32 ( (float32_t*)dst, n_dst );     /* store the results back */ \
    dst += 4;                                 /* move to the next 4 items */ \
}
561
562
/*
 * Tail-loop body for the scalar-float "DstAccSrc1Src2" kernels: handles a
 * single float, for counts that are not a multiple of 4.
 * Declares n_rest_acc (lane 0 from *acc), n_rest (lane 0 from *src1) and
 * n_rest2 (lane 0 from *src2) for loopCode; lane 1 of each stays 0.0f.
 * Lane 0 of n_rest is stored to *dst, then acc, src1, src2 and dst advance
 * by one element.
 */
#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_rest_acc = { 0.0f , 0.0f };  /* scratch vector for the accumulator */ \
    float32x2_t n_rest = { 0.0f , 0.0f };      /* scratch vector for src1 / the result */ \
    float32x2_t n_rest2 = { 0.0f, 0.0f };      /* scratch vector for src2 */ \
    n_rest_acc = vld1_lane_f32 ( (float32_t*)acc, n_rest_acc, 0);  /* load into lane 0 */ \
    n_rest = vld1_lane_f32 ( (float32_t*)src1, n_rest, 0);         /* load into lane 0 */ \
    n_rest2 = vld1_lane_f32 ( (float32_t*)src2, n_rest2, 0);       /* load into lane 0 */ \
    loopCode;                                  /* the actual operation is placed here */ \
    vst1_lane_f32 ( (float32_t*)dst, n_rest, 0);                   /* store lane 0 back to memory */ \
    acc++;                                     /* move to the next item in the stream */ \
    src1++; \
    src2++; \
    dst++; \
}
577
578
/* The accumulating two-source scalar-float kernels use the same driver as the
   DstAccSrcCst group.
   NOTE(review): n_acc and n_src2 are presumably declared by the caller;
   confirm at the call sites. */
#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
Generated on Fri Jun 30 2017 10:50:54 for Project Ne10 by
1.8.11