if (src_sz.y == 1 || kernel.x >= (1 << 7) || kernel.x == 1)
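/* (Note added for clarity, not from the original source.) The NEON path below
 * sums two rows of 8-bit pixels into signed 16-bit lanes, so kernel.x must
 * stay below 1 << 7 (127 * 255 = 32385 still fits in an int16). The degenerate
 * cases (a one-row image or a one-pixel-wide kernel) are presumably handed off
 * to the scalar ne10_img_boxfilter_row_c listed further down. */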
assert (kernel.x > 0);
assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
assert ((src_sz.y > 1) &&
        (kernel.x < (1 << 7)) &&
        (kernel.x > 1));
int16x8_t mul_vec = vdupq_n_s16 (mul);

for (y = 0; y < src_sz.y; y += 2)
if ((src_sz.y % 2 != 0) && (y == src_sz.y - 1))

const ne10_uint8_t *src_row2 = src + (y + 1) * src_stride;
for (x = 0; x < kernel.x; x++)

sum[k] += * (src_row1 + x * RGBA_CH + k);
sum[k + 4] += * (src_row2 + x * RGBA_CH + k);

*(dst_row1 + border_l * RGBA_CH + k) = sum[k] * mul >>
*(dst_row2 + border_l * RGBA_CH + k) = sum[k + 4] * mul >>
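/* (Note added for clarity, not from the original source.) This scalar prologue
 * accumulates the first kernel.x pixels of both rows into per-channel sums and
 * writes the left-border output; the shift amount is elided in this excerpt
 * and presumably applies the same fixed-point normalisation as the
 * vqdmulhq_s16 scaling in the vector loop below. */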
const ne10_uint8_t *src_pixel1 = src_row1 + (1 + border_l) * RGBA_CH;
const ne10_uint8_t *src_pixel2 = src_row2 + (1 + border_l) * RGBA_CH;
const ne10_uint8_t *src_pixel_end = src_row1 + (src_sz.x - border_r) * RGBA_CH;

ne10_uint8_t *dst_pixel1 = dst_row1 + (1 + border_l) * RGBA_CH;
ne10_uint8_t *dst_pixel2 = dst_row2 + (1 + border_l) * RGBA_CH;
int16x8_t sum_vec = vld1q_s16 (sum);

uint16x8_t sum_vec_u;
uint8x8_t src_pixel_next_vec, src_pixel_prev_vec;
uint32x2_t src_pixel_next_tmp_vec, src_pixel_prev_tmp_vec;
uint32x2_t src_pixel_next_tmp_vec_pre, src_pixel_prev_tmp_vec_pre;
uint32x2_t dst_pixel_vec;
uint8x8_t dst_pixel_tmp_vec;
src_pixel_next_tmp_vec = vld1_lane_u32 (
    src_pixel_next_tmp_vec,
src_pixel_prev_tmp_vec = vld1_lane_u32 (
    src_pixel_prev_tmp_vec,
src_pixel_next_tmp_vec = vld1_lane_u32 (
    src_pixel_next_tmp_vec,
src_pixel_prev_tmp_vec = vld1_lane_u32 (
    src_pixel_prev_tmp_vec,
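/* (Note added for clarity, not from the original source.) Each vld1_lane_u32
 * pulls one whole 32-bit RGBA pixel into one lane of a uint32x2_t, so a pair
 * of loads gathers the corresponding pixel from both rows being processed;
 * the pointer and lane arguments fall on source lines omitted from this
 * excerpt. */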
while (src_pixel1 < src_pixel_end)

src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
    src_pixel_next_tmp_vec_pre,
src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
    src_pixel_prev_tmp_vec_pre,
src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
    src_pixel_next_tmp_vec_pre,
src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
    src_pixel_prev_tmp_vec_pre,
src_pixel_prev_vec = vreinterpret_u8_u32 (src_pixel_prev_tmp_vec);
src_pixel_next_vec = vreinterpret_u8_u32 (src_pixel_next_tmp_vec);

sum_vec_u = vreinterpretq_u16_s16 (sum_vec);
sum_vec_u = vaddw_u8 (sum_vec_u, src_pixel_next_vec);
sum_vec_u = vsubw_u8 (sum_vec_u, src_pixel_prev_vec);
sum_vec = vreinterpretq_s16_u16 (sum_vec_u);

sum_tmp = vqdmulhq_s16 (sum_vec, mul_vec);
dst_pixel_tmp_vec = vqmovun_s16 (sum_tmp);
dst_pixel_vec = vreinterpret_u32_u8 (dst_pixel_tmp_vec);
vst1_lane_u32 ((ne10_uint32_t *) dst_pixel1, dst_pixel_vec, 0);
vst1_lane_u32 ((ne10_uint32_t *) dst_pixel2, dst_pixel_vec, 1);

src_pixel_prev_tmp_vec = src_pixel_prev_tmp_vec_pre;
src_pixel_next_tmp_vec = src_pixel_next_tmp_vec_pre;
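The loop above is the core of the row pass: vaddw_u8 adds the pixel entering the window, vsubw_u8 subtracts the pixel leaving it, vqdmulhq_s16 (a saturating doubling multiply returning the high half, i.e. (2 * a * b) >> 16) scales the running sum by a fixed-point multiplier, and vqmovun_s16 narrows the result back to unsigned 8-bit. Below is a minimal scalar sketch of the same sliding-window idea; it is not Ne10 code, and the helper name and the mul = (1 << 15) / k choice are assumptions inferred from the use of vqdmulhq_s16.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vector loop above: slide a k-wide box along one channel
 * of one row, updating the sum incrementally and scaling it the way
 * vqdmulhq_s16 would, i.e. (2 * sum * mul) >> 16. */
static void box_row_scalar (const uint8_t *src, uint8_t *dst, int width, int k)
{
    int32_t mul = (1 << 15) / k;   /* assumed Q15 reciprocal of the kernel width */
    int32_t sum = 0;

    for (int x = 0; x < k; x++)    /* initial window, like the scalar prologue above */
        sum += src[x];
    dst[0] = (uint8_t) ((2 * sum * mul) >> 16);

    for (int x = 1; x + k <= width; x++)
    {
        sum += src[x + k - 1];     /* pixel entering the window (vaddw_u8) */
        sum -= src[x - 1];         /* pixel leaving the window (vsubw_u8)  */
        dst[x] = (uint8_t) ((2 * sum * mul) >> 16);   /* vqdmulhq + vqmovun */
    }
}

int main (void)
{
    uint8_t in[8] = { 0, 255, 255, 0, 0, 255, 255, 0 };
    uint8_t out[8] = { 0 };

    box_row_scalar (in, out, 8, 3);
    for (int i = 0; i < 6; i++)    /* 6 fully covered positions for k = 3 */
        printf ("%u ", out[i]);
    printf ("\n");
    return 0;
}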
if (kernel.y == 1 || kernel.y >= (1 << 7) || src_sz.x == 1)
assert (kernel.y > 0);
assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
assert ((src_sz.x > 1) &&
        (kernel.y < (1 << 7)) &&
        (kernel.y > 1));
277 "ERROR: buffer allocation fails!\nallocation size: %d\n",
for (x = 0; x < src_sz.x * RGBA_CH; x++)

for (x = 0; x < src_sz.x; x++)
ne10_uint8_t *dst_pixel = dst_col + border_t * dst_stride;

for (y = 0; y < kernel.y; y++)

const ne10_uint8_t *src_pixel = src_col + y * src_stride;

sum[k] += src_pixel[k];

dst_pixel[k] = sum_row[x * RGBA_CH + k] * mul >>
const ne10_uint8_t *src_row = src + (1 + border_t) * src_stride;
const ne10_uint8_t *src_row_end = src + (src_sz.y - border_b) * src_stride;

ne10_uint8_t *dst_row = dst + (1 + border_t) * dst_stride;
uint16x8_t sum_vec, sum_vec_pre;

uint8x8_t src_pixel_prev_vec, src_pixel_next_vec;
uint8x8_t src_pixel_prev_vec_pre, src_pixel_next_vec_pre;
uint8x8_t dst_pixel_vec;

int16x8_t mul_vec = vdupq_n_s16 (mul);
if (src_sz.x % 2 != 0)

sum_val_bakcup[k] = sum_row[ (src_sz.x - 2) * RGBA_CH + k];
src_sz_x_adjust = src_sz.x - 1;
while (src_row < src_row_end)

src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
src_pixel_next_vec = vld1_u8 (src_pixel + next);

sum_vec = vld1q_u16 (sum_row);

for (x = 0; x < src_sz_x_adjust; x += 2)

sum_vec_pre = vld1q_u16 (sum_pre);

src_pixel = src_row + (x + 2) * RGBA_CH;
src_pixel_prev_vec_pre = vld1_u8 (src_pixel - prev);
src_pixel_next_vec_pre = vld1_u8 (src_pixel + next);

sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
sum_vec_s = vreinterpretq_s16_u16 (sum_vec);

sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
dst_pixel_vec = vqmovun_s16 (sum_vec_s);
dst_pixel = dst_row + x * RGBA_CH;
vst1_u8 (dst_pixel, dst_pixel_vec);
vst1q_u16 (sum, sum_vec);

src_pixel_next_vec = src_pixel_next_vec_pre;
src_pixel_prev_vec = src_pixel_prev_vec_pre;
sum_vec = sum_vec_pre;

src_row += src_stride;
dst_row += dst_stride;
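/* (Note added for clarity, not from the original source.) The column pass
 * keeps one running 16-bit sum per colour channel per column in sum_row and
 * slides it vertically: vld1_u8 fetches the row entering the window
 * (src_pixel + next) and the row leaving it (src_pixel - prev), and each
 * iteration handles two RGBA pixels, i.e. all eight lanes of the vectors. */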
if (src_sz.x % 2 != 0)

sum_row[ (src_sz.x - 2) * RGBA_CH + k] = sum_val_bakcup[k];

src_row = src + (1 + border_t) * src_stride;
dst_row = dst + (1 + border_t) * dst_stride;

sum_vec = vld1q_u16 (sum_row + x * RGBA_CH);
while (src_row < src_row_end)

src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
src_pixel_next_vec = vld1_u8 (src_pixel + next);
sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
sum_vec_s = vreinterpretq_s16_u16 (sum_vec);

sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
dst_pixel_vec = vqmovun_s16 (sum_vec_s);
vst1_u8 (dst_pixel, dst_pixel_vec);

src_row += src_stride;
dst_row += dst_stride;
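/* (Note added for clarity, not from the original source.) The main loop above
 * consumes two pixels per iteration, so an odd image width cannot be covered
 * by it alone: the saved accumulator (sum_val_bakcup) is restored into
 * sum_row and this second loop sweeps the one remaining column down the
 * image. */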
assert (src != 0 && dst != 0);
assert (src_sz.x > 0 && src_sz.y > 0);
assert (src_stride > 0 && dst_stride > 0);
assert (kernel.x > 0 && kernel.x <= src_sz.x
        && kernel.y > 0 && kernel.y <= src_sz.y);
anchor.x = kernel.x / 2;
anchor.y = kernel.y / 2;
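/* (Note added for clarity, not from the original source.) Forcing the anchor
 * to kernel / 2 centres the box on each output pixel; the border_* widths
 * computed by the *_border helpers then cover the margin where the centred
 * window would fall outside the image. */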
447 "ERROR: buffer allocation fails!\nallocation size: %d\n",
void ne10_img_boxfilter_col_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_t, ne10_int32_t border_b)
void ne10_img_boxfilter_col_border(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t *border_t_ptr, ne10_int32_t *border_b_ptr)
void ne10_img_boxfilter_rgba8888_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel)
Specific implementation of ne10_img_boxfilter_rgba8888 using NEON SIMD capabilities (a usage sketch follows this list).
void ne10_img_boxfilter_row_border(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t *border_l_ptr, ne10_int32_t *border_r_ptr)
void ne10_img_boxfilter_col_c(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_t, ne10_int32_t border_b)
void ne10_img_boxfilter_row_c(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_l, ne10_int32_t border_r)
Structure for a point in an image.
void ne10_img_boxfilter_row_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel, ne10_point_t anchor, ne10_int32_t border_l, ne10_int32_t border_r)
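As noted above, here is a minimal, self-contained usage sketch for the NEON entry point. It is illustrative only: the header name, the field order of the ne10_size_t initialisers, and the assumption that strides are given in bytes (which the pointer arithmetic in the listing suggests) are not confirmed by this page.

#include <stdlib.h>
#include "NE10_imgproc.h"   /* assumed header exposing the image-processing API */

int main (void)
{
    ne10_size_t src_sz = { 64, 48 };      /* assumed member order: x (width), y (height) */
    ne10_size_t kernel = { 5, 5 };        /* 5x5 box, well below the 1 << 7 limit */
    ne10_int32_t stride = src_sz.x * 4;   /* RGBA8888: 4 bytes per pixel, rows packed */

    ne10_uint8_t *src = (ne10_uint8_t *) calloc ((size_t) stride * src_sz.y, 1);
    ne10_uint8_t *dst = (ne10_uint8_t *) calloc ((size_t) stride * src_sz.y, 1);
    if (!src || !dst)
        return 1;

    /* ... fill src with RGBA pixels ... */

    ne10_img_boxfilter_rgba8888_neon (src, dst, src_sz, stride, stride, kernel);

    free (src);
    free (dst);
    return 0;
}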