Project Ne10 — an open, optimized software library for the ARM architecture.
File: NE10_boxfilter.neon.c (NEON implementation of the RGBA8888 box filter).
1 /*
2  * Copyright 2013-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : imgproc/NE10_boxfilter.c
30  */
31 
#include "NE10.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <arm_neon.h>
37 extern void ne10_img_boxfilter_row_border (const ne10_uint8_t* src,
38  ne10_uint8_t* dst,
39  ne10_size_t src_sz,
40  ne10_int32_t src_stride,
41  ne10_int32_t dst_stride,
42  ne10_size_t kernel,
43  ne10_point_t anchor,
44  ne10_int32_t *border_l_ptr,
45  ne10_int32_t *border_r_ptr);
46 
47 extern void ne10_img_boxfilter_col_border (const ne10_uint8_t *src,
48  ne10_uint8_t *dst,
49  ne10_size_t src_sz,
50  ne10_int32_t src_stride,
51  ne10_int32_t dst_stride,
52  ne10_size_t kernel,
53  ne10_point_t anchor,
54  ne10_int32_t *border_t_ptr,
55  ne10_int32_t *border_b_ptr);
56 
57 extern void ne10_img_boxfilter_row_c (const ne10_uint8_t *src,
58  ne10_uint8_t *dst,
59  ne10_size_t src_sz,
60  ne10_int32_t src_stride,
61  ne10_int32_t dst_stride,
62  ne10_size_t kernel,
63  ne10_point_t anchor,
64  ne10_int32_t border_l,
65  ne10_int32_t border_r);
66 
67 extern void ne10_img_boxfilter_col_c (const ne10_uint8_t *src,
68  ne10_uint8_t *dst,
69  ne10_size_t src_sz,
70  ne10_int32_t src_stride,
71  ne10_int32_t dst_stride,
72  ne10_size_t kernel,
73  ne10_point_t anchor,
74  ne10_int32_t border_t,
75  ne10_int32_t border_b);
76 
77 /* RGBA CHANNEL number is 4 */
78 #define RGBA_CH 4
79 /* DIV_SHIFT is used in replacement of constant division */
80 #define DIV_SHIFT 15
81 
83  ne10_uint8_t *dst,
84  ne10_size_t src_sz,
85  ne10_int32_t src_stride,
86  ne10_int32_t dst_stride,
87  ne10_size_t kernel,
88  ne10_point_t anchor,
89  ne10_int32_t border_l,
90  ne10_int32_t border_r)
91 {
92  /* when in special cases, we'll call the c version of row filter */
93  if (src_sz.y == 1 || kernel.x >= (1 << 7) || kernel.x == 1)
94  {
95  return ne10_img_boxfilter_row_c (src,
96  dst,
97  src_sz,
98  src_stride,
99  dst_stride,
100  kernel,
101  anchor,
102  border_l,
103  border_r);
104  }
105 
106  assert (src != dst);
107  assert (kernel.x > 0);
108  assert ( (kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
109  /* if kernel.x was 1, the mul variable would overflow when it
110  * is casted to ne10_int16_t type.
111  */
112  assert ((src_sz.y > 1) &&
113  (kernel.x < (1 << 7)) &&
114  (kernel.x > 1));
115 
116  ne10_int32_t x, y, k;
117  ne10_int16_t mul = (1 << DIV_SHIFT) / kernel.x;
118  int16x8_t mul_vec = vdupq_n_s16 (mul);
119 
120  for (y = 0; y < src_sz.y; y += 2)
121  {
122  /* step back one row when image height is odd and before reaching last
123  * line
124  */
125  if ((src_sz.y % 2 != 0) && (y == src_sz.y - 1))
126  y--;
127 
128  const ne10_uint8_t *src_row1 = src + y * src_stride;
129  const ne10_uint8_t *src_row2 = src + (y + 1) * src_stride;
130  ne10_uint8_t *dst_row1 = dst + y * dst_stride;
131  ne10_uint8_t *dst_row2 = dst + (y + 1) * dst_stride;
132  ne10_int16_t sum[RGBA_CH * 2];
133 
134  for (k = 0; k < RGBA_CH; k++)
135  {
136  sum[k] = 0;
137  sum[k + 4] = 0;
138 
139  for (x = 0; x < kernel.x; x++)
140  {
141  sum[k] += * (src_row1 + x * RGBA_CH + k);
142  sum[k + 4] += * (src_row2 + x * RGBA_CH + k);
143  }
144 
145  *(dst_row1 + border_l * RGBA_CH + k) = sum[k] * mul >>
146  DIV_SHIFT;
147  *(dst_row2 + border_l * RGBA_CH + k) = sum[k + 4] * mul >>
148  DIV_SHIFT;
149  }
150 
151  ne10_uint32_t prev = (anchor.x + 1) * RGBA_CH;
152  ne10_uint32_t next = (kernel.x - anchor.x - 1) * RGBA_CH;
153  const ne10_uint8_t *src_pixel1 = src_row1 + (1 + border_l) * RGBA_CH;
154  const ne10_uint8_t *src_pixel2 = src_row2 + (1 + border_l) * RGBA_CH;
155  const ne10_uint8_t *src_pixel_end = src_row1 + (src_sz.x - border_r) *
156  RGBA_CH;
157  ne10_uint8_t *dst_pixel1 = dst_row1 + (1 + border_l) * RGBA_CH;
158  ne10_uint8_t *dst_pixel2 = dst_row2 + (1 + border_l) * RGBA_CH;
159 
160  int16x8_t sum_vec = vld1q_s16 (sum);
161  int16x8_t sum_tmp;
162  uint16x8_t sum_vec_u;
163  uint8x8_t src_pixel_next_vec, src_pixel_prev_vec;
164  uint32x2_t src_pixel_next_tmp_vec, src_pixel_prev_tmp_vec;
165  uint32x2_t src_pixel_next_tmp_vec_pre, src_pixel_prev_tmp_vec_pre;
166  uint32x2_t dst_pixel_vec;
167  uint8x8_t dst_pixel_tmp_vec;
168 
169 
170  /* preload */
171  src_pixel_next_tmp_vec = vld1_lane_u32 (
172  (const ne10_uint32_t*) (src_pixel1 + next),
173  src_pixel_next_tmp_vec,
174  0);
175  src_pixel_prev_tmp_vec = vld1_lane_u32 (
176  (const ne10_uint32_t*) (src_pixel1 - prev),
177  src_pixel_prev_tmp_vec,
178  0);
179  src_pixel_next_tmp_vec = vld1_lane_u32 (
180  (const ne10_uint32_t*) (src_pixel2 + next),
181  src_pixel_next_tmp_vec,
182  1);
183  src_pixel_prev_tmp_vec = vld1_lane_u32 (
184  (const ne10_uint32_t*) (src_pixel2 - prev),
185  src_pixel_prev_tmp_vec,
186  1);
187 
188  /* load two rows to do filtering */
189  while (src_pixel1 < src_pixel_end)
190  {
191  /* preload */
192  src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
193  (const ne10_uint32_t*) (src_pixel1 + 4 + next),
194  src_pixel_next_tmp_vec_pre,
195  0);
196  src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
197  (const ne10_uint32_t*) (src_pixel1 + 4 - prev),
198  src_pixel_prev_tmp_vec_pre,
199  0);
200  src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
201  (const ne10_uint32_t*) (src_pixel2 + 4 + next),
202  src_pixel_next_tmp_vec_pre,
203  1);
204  src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
205  (const ne10_uint32_t*) (src_pixel2 + 4 - prev),
206  src_pixel_prev_tmp_vec_pre,
207  1);
208 
209  src_pixel_prev_vec = vreinterpret_u8_u32 (src_pixel_prev_tmp_vec);
210  src_pixel_next_vec = vreinterpret_u8_u32 (src_pixel_next_tmp_vec);
211 
212  sum_vec_u = vreinterpretq_u16_s16 (sum_vec);
213  sum_vec_u = vaddw_u8 (sum_vec_u, src_pixel_next_vec);
214  sum_vec_u = vsubw_u8 (sum_vec_u, src_pixel_prev_vec);
215  sum_vec = vreinterpretq_s16_u16 (sum_vec_u);
216  /* vqdmulhq_n_s16 would shift the result 16 bit */
217  sum_tmp = vqdmulhq_s16 (sum_vec, mul_vec);
218  dst_pixel_tmp_vec = vqmovun_s16 (sum_tmp);
219  dst_pixel_vec = vreinterpret_u32_u8 (dst_pixel_tmp_vec);
220  vst1_lane_u32 ((ne10_uint32_t *) dst_pixel1, dst_pixel_vec, 0);
221  vst1_lane_u32 ((ne10_uint32_t *) dst_pixel2, dst_pixel_vec, 1);
222 
223  src_pixel_prev_tmp_vec = src_pixel_prev_tmp_vec_pre;
224  src_pixel_next_tmp_vec = src_pixel_next_tmp_vec_pre;
225 
226  src_pixel1 += 4;
227  src_pixel2 += 4;
228  dst_pixel1 += 4;
229  dst_pixel2 += 4;
230  }
231  }
232 }
233 
235  ne10_uint8_t *dst,
236  ne10_size_t src_sz,
237  ne10_int32_t src_stride,
238  ne10_int32_t dst_stride,
239  ne10_size_t kernel,
240  ne10_point_t anchor,
241  ne10_int32_t border_t,
242  ne10_int32_t border_b)
243 {
244  /* when in special cases, we'll call c version to do the work */
245  if (kernel.y == 1 || kernel.y >= (1 << 7) || src_sz.x == 1)
246  {
247  return ne10_img_boxfilter_col_c (src,
248  dst,
249  src_sz,
250  src_stride,
251  dst_stride,
252  kernel,
253  anchor,
254  border_t,
255  border_b);
256  }
257 
258  assert (src != dst);
259  assert (kernel.y > 0);
260  assert ( (kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
261  /* if kernel.y was 1, the mul variable would overflow when it
262  * is casted to ne10_int16_t type.
263  */
264  assert ( (src_sz.x > 1) &&
265  (kernel.y < (1 << 7)) &&
266  (kernel.y > 1));
267 
268  ne10_int32_t x, y, k;
269  ne10_uint16_t *sum_row = (ne10_uint16_t *) malloc (src_sz.x *
270  RGBA_CH *
271  sizeof (ne10_uint16_t));
272  ne10_uint16_t mul = (1 << DIV_SHIFT) / kernel.y;
273 
274  if (!sum_row)
275  {
276  fprintf (stderr,
277  "ERROR: buffer allocation fails!\nallocation size: %d\n",
278  sizeof (ne10_uint32_t) *
279  src_sz.x *
280  RGBA_CH);
281  return;
282  }
283 
284  for (x = 0; x < src_sz.x * RGBA_CH; x++)
285  {
286  sum_row[x] = 0;
287  }
288 
289  for (x = 0; x < src_sz.x; x++)
290  {
291  const ne10_uint8_t *src_col = src + x * RGBA_CH;
292  ne10_uint8_t *dst_col = dst + x * RGBA_CH;
293  ne10_uint8_t *dst_pixel = dst_col + border_t * dst_stride;
294  ne10_uint16_t *sum = sum_row + x * RGBA_CH;
295 
296  for (y = 0; y < kernel.y; y++)
297  {
298  const ne10_uint8_t *src_pixel = src_col + y * src_stride;
299 
300  for (k = 0; k < RGBA_CH; k++)
301  {
302  sum[k] += src_pixel[k];
303  }
304  }
305 
306  for (k = 0; k < RGBA_CH; k++)
307  {
308  dst_pixel[k] = sum_row[x * RGBA_CH + k] * mul >>
309  DIV_SHIFT;
310  }
311  }
312 
313  const ne10_uint8_t *src_row = src + (1 + border_t) * src_stride;
314  const ne10_uint8_t *src_row_end = src + (src_sz.y - border_b) *
315  src_stride;
316  ne10_uint8_t *dst_row = dst + (1 + border_t) * dst_stride;
317  ne10_uint32_t prev = (anchor.y + 1) * src_stride;
318  ne10_uint32_t next = (kernel.y - anchor.y - 1) * src_stride;
319 
320  uint16x8_t sum_vec, sum_vec_pre;
321  int16x8_t sum_vec_s;
322  uint8x8_t src_pixel_prev_vec, src_pixel_next_vec;
323  uint8x8_t src_pixel_prev_vec_pre, src_pixel_next_vec_pre;
324  uint8x8_t dst_pixel_vec;
325 
326  ne10_uint16_t sum_val_bakcup[RGBA_CH];
327  ne10_uint32_t src_sz_x_adjust = src_sz.x;
328  int16x8_t mul_vec = vdupq_n_s16 (mul);
329 
330  if (src_sz.x % 2 != 0)
331  {
332  for (k = 0; k < RGBA_CH; k++)
333  sum_val_bakcup[k] = sum_row[ (src_sz.x - 2) * RGBA_CH + k];
334  src_sz_x_adjust = src_sz.x - 1;
335  }
336 
337  /* boxfilter column filter is done once in a row, which
338  * is more friendly to cache than once in a column.
339  */
340  while (src_row < src_row_end)
341  {
342  /* preload */
343  const ne10_uint8_t *src_pixel = src_row;
344  ne10_uint8_t *dst_pixel = dst_row;
345  src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
346  src_pixel_next_vec = vld1_u8 (src_pixel + next);
347  ne10_uint16_t *sum, *sum_pre;
348  sum_vec = vld1q_u16 (sum_row);
349 
350  for (x = 0; x < src_sz_x_adjust; x += 2)
351  {
352  sum = sum_row + x * RGBA_CH;
353  sum_pre = sum + 2 * RGBA_CH;
354  sum_vec_pre = vld1q_u16 (sum_pre);
355  /* preload */
356  src_pixel = src_row + (x + 2) * RGBA_CH;
357  src_pixel_prev_vec_pre = vld1_u8 (src_pixel - prev);
358  src_pixel_next_vec_pre = vld1_u8 (src_pixel + next);
359 
360  sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
361  sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
362  sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
363  /* vqdmulhq_n_s16 would shift the result 16 bit */
364  sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
365  dst_pixel_vec = vqmovun_s16 (sum_vec_s);
366  dst_pixel = dst_row + x * RGBA_CH;
367  vst1_u8 (dst_pixel, dst_pixel_vec);
368  vst1q_u16 (sum, sum_vec);
369 
370  src_pixel_next_vec = src_pixel_next_vec_pre;
371  src_pixel_prev_vec = src_pixel_prev_vec_pre;
372  sum_vec = sum_vec_pre;
373  }
374  src_row += src_stride;
375  dst_row += dst_stride;
376  }
377 
378  if (src_sz.x % 2 != 0)
379  {
380  for (k = 0; k < RGBA_CH; k++)
381  sum_row[ (src_sz.x - 2) * RGBA_CH + k] = sum_val_bakcup[k];
382 
383  src_row = src + (1 + border_t) * src_stride;
384  dst_row = dst + (1 + border_t) * dst_stride;
385  /* step back one column */
386  x = src_sz.x - 2;
387  sum_vec = vld1q_u16 (sum_row + x * RGBA_CH);
388 
389  while (src_row < src_row_end)
390  {
391  const ne10_uint8_t *src_pixel = src_row + x * RGBA_CH;
392  ne10_uint8_t *dst_pixel = dst_row + x * RGBA_CH;
393  src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
394  src_pixel_next_vec = vld1_u8 (src_pixel + next);
395  sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
396  sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
397  sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
398  /* vqdmulhq_n_s16 would shift the result 16 bit
399  */
400  sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
401  dst_pixel_vec = vqmovun_s16 (sum_vec_s);
402  vst1_u8 (dst_pixel, dst_pixel_vec);
403 
404  src_row += src_stride;
405  dst_row += dst_stride;
406  }
407  }
408 
409  free (sum_row);
410 }
411 
419  ne10_uint8_t *dst,
420  ne10_size_t src_sz,
421  ne10_int32_t src_stride,
422  ne10_int32_t dst_stride,
423  ne10_size_t kernel)
424 {
425  assert (src != 0 && dst != 0);
426  assert (src_sz.x > 0 && src_sz.y > 0);
427  assert (src_stride > 0 && dst_stride > 0);
428  assert (kernel.x > 0 && kernel.x <= src_sz.x
429  && kernel.y > 0 && kernel.y <= src_sz.y);
430 
431  ne10_int32_t border_l, border_r, border_t, border_b;
432  ne10_point_t anchor;
433 
434  anchor.x = kernel.x / 2;
435  anchor.y = kernel.y / 2;
436 
437  /* the extra 2 elements here is reserved for pre-load when do row or column
438  * filter */
439  ne10_uint32_t mem_bytes = (sizeof (ne10_uint8_t) * src_sz.x * src_sz.y + 2)
440  * RGBA_CH;
441 
442  ne10_uint8_t *dst_buf = (ne10_uint8_t *) malloc (mem_bytes);
443 
444  if (!dst_buf)
445  {
446  fprintf (stderr,
447  "ERROR: buffer allocation fails!\nallocation size: %d\n",
448  mem_bytes);
449  return;
450  }
451 
452  ne10_int32_t dst_buf_stride = src_sz.x * RGBA_CH;
453 
454  /* compute the row border of dst image */
456  dst_buf,
457  src_sz,
458  src_stride,
459  dst_buf_stride,
460  kernel,
461  anchor,
462  &border_l,
463  &border_r);
464  /* boxfilter is separable filter, and then can be apply row filter and
465  * column filter sequentially. here apply boxfilter's row part to image
466  */
468  dst_buf,
469  src_sz,
470  src_stride,
471  dst_buf_stride,
472  kernel,
473  anchor,
474  border_l,
475  border_r);
476 
477  /* compute the column border of dst image,
478  * which is based on previous row filter result.
479  */
481  dst,
482  src_sz,
483  dst_buf_stride,
484  dst_stride,
485  kernel,
486  anchor,
487  &border_t,
488  &border_b);
489 
490  /* apply boxfilter column filter to image */
492  dst,
493  src_sz,
494  dst_buf_stride,
495  dst_stride,
496  kernel,
497  anchor,
498  border_t,
499  border_b);
500 
501  free (dst_buf);
502 }
void ne10_img_boxfilter_col_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_t, ne10_int32_t border_b)
uint8_t ne10_uint8_t
Definition: NE10_types.h:73
void ne10_img_boxfilter_col_border(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t *border_t_ptr, ne10_int32_t *border_b_ptr)
ne10_uint32_t y
Definition: NE10_types.h:440
int32_t ne10_int32_t
Definition: NE10_types.h:76
void ne10_img_boxfilter_rgba8888_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel)
Specific implementation of ne10_img_boxfilter_rgba8888 using NEON SIMD capabilities.
void ne10_img_boxfilter_row_border(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t *border_l_ptr, ne10_int32_t *border_r_ptr)
ne10_uint32_t x
Definition: NE10_types.h:439
uint16_t ne10_uint16_t
Definition: NE10_types.h:75
uint32_t ne10_uint32_t
Definition: NE10_types.h:77
void ne10_img_boxfilter_col_c(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_t, ne10_int32_t border_b)
void ne10_img_boxfilter_row_c(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_l, ne10_int32_t border_r)
ne10_uint32_t y
Definition: NE10_types.h:434
#define DIV_SHIFT
Structure for point in image.
Definition: NE10_types.h:431
#define RGBA_CH
int16_t ne10_int16_t
Definition: NE10_types.h:74
void ne10_img_boxfilter_row_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_l, ne10_int32_t border_r)
ne10_uint32_t x
Definition: NE10_types.h:433