Project Ne10
An open, optimized software library for the ARM architecture.
NE10_resize.neon.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <arm_neon.h>
29 
30 #include "NE10.h"
31 #include "NE10_mask_table.h"
32 
33 #define INTER_RESIZE_COEF_BITS (11) /* number of fractional bits in one fixed-point interpolation coefficient */
34 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) /* fixed-point value of a coefficient equal to 1.0 (2048) */
35 #define BITS (INTER_RESIZE_COEF_BITS*2) /* right-shift applied after the horizontal and vertical coefficients have both been multiplied in (22) */
36 #define DELTA (1 << (INTER_RESIZE_COEF_BITS*2 - 1)) /* half of (1 << BITS); added before the shift so the result is rounded rather than truncated */
37 
/*
 * Loads exactly four bytes starting at p and widens them to four signed
 * 16-bit lanes. Used for border pixels, where only one 4-channel pixel is
 * consumed and an 8-byte load would touch memory past that pixel.
 */
static inline int16x4_t ne10_img_load4_u8_to_s16 (const unsigned char *p)
{
    uint8x8_t bytes = vdup_n_u8 (0);

    bytes = vld1_lane_u8 (&p[0], bytes, 0);
    bytes = vld1_lane_u8 (&p[1], bytes, 1);
    bytes = vld1_lane_u8 (&p[2], bytes, 2);
    bytes = vld1_lane_u8 (&p[3], bytes, 3);

    return vget_low_s16 (vreinterpretq_s16_u16 (vmovl_u8 (bytes)));
}

/**
 * Horizontal pass of a fixed-point linear resize for 4-channel 8-bit rows.
 *
 * For every destination index dx (stepped 4 at a time, i.e. one 4-channel
 * pixel per iteration) and channel c in 0..3:
 *
 *   dx <  xmax : dst[k][dx + c] = src[k][xofs[dx] + c]     * alpha[2 * (dx + c)]
 *                               + src[k][xofs[dx] + 4 + c] * alpha[2 * (dx + c) + 1]
 *   dx >= xmax : dst[k][dx + c] = src[k][xofs[dx] + c] * INTER_RESIZE_COEF_SCALE
 *
 * @param src    array of `count` source row pointers (8-bit samples)
 * @param dst    array of `count` destination row pointers (32-bit results)
 * @param count  number of rows to process
 * @param xofs   per-destination-index offset into the source row; only the
 *               entries at multiples of 4 are read
 * @param alpha  interleaved coefficient pairs, two shorts per destination index
 * @param swidth unused
 * @param dwidth number of destination elements per row
 * @param cn     unused; the kernel is hard-wired to 4 channels
 * @param xmin   unused
 * @param xmax   first destination index handled by the border path
 *
 * NOTE(review): both loops write 4 elements per step, so xmax and dwidth are
 * presumably multiples of 4 -- confirm against the caller.
 */
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    const int16x4_t dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);
    int dx, k;

    (void) swidth;
    (void) cn;
    (void) xmin;

    /* Two rows per iteration. The step must be 2: each pass already produces
     * rows k and k + 1, so stepping by 1 would compute every row twice. */
    for (k = 0; k <= count - 2; k += 2)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = 0; dx < xmax; dx += 4)
        {
            const int sx = xofs[dx];

            /* val[0] holds the left-pixel weights, val[1] the right-pixel weights. */
            const int16x4x2_t alpha_vec = vld2_s16 (&alpha[dx * 2]);

            /* 8 bytes = the two neighbouring 4-channel source pixels. */
            const int16x8_t qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (&S0[sx])));
            const int16x8_t qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (&S1[sx])));

            int32x4_t qT0_vec = vmull_s16 (vget_low_s16 (qS0_vec), alpha_vec.val[0]);
            int32x4_t qT1_vec = vmull_s16 (vget_low_s16 (qS1_vec), alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, vget_high_s16 (qS0_vec), alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, vget_high_s16 (qS1_vec), alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        /* Border: a single source pixel scaled by the unit coefficient. */
        for (; dx < dwidth; dx += 4)
        {
            const int sx = xofs[dx];

            vst1q_s32 (&D0[dx], vmull_s16 (ne10_img_load4_u8_to_s16 (&S0[sx]), dCoeff));
            vst1q_s32 (&D1[dx], vmull_s16 (ne10_img_load4_u8_to_s16 (&S1[sx]), dCoeff));
        }
    }

    /* At most one leftover row when count is odd. */
    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];

        for (dx = 0; dx < xmax; dx += 4)
        {
            const int sx = xofs[dx];
            const int16x4x2_t alpha_vec = vld2_s16 (&alpha[dx * 2]);
            const int16x8_t qS_vec = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (&S[sx])));

            int32x4_t qT_vec = vmull_s16 (vget_low_s16 (qS_vec), alpha_vec.val[0]);
            qT_vec = vmlal_s16 (qT_vec, vget_high_s16 (qS_vec), alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            const int sx = xofs[dx];

            vst1q_s32 (&D[dx], vmull_s16 (ne10_img_load4_u8_to_s16 (&S[sx]), dCoeff));
        }
    }
}
143 
144 
/*
 * Blends eight 32-bit samples from each of two rows with the weights held in
 * dBeta (lane 0 for s0, lane 1 for s1), rounds, shifts down by BITS, clamps
 * to [0, 255] and narrows to eight bytes. Reads exactly 8 ints from each of
 * s0 and s1.
 */
static inline uint8x8_t ne10_img_vresize_blend8 (const int *s0, const int *s1, int32x2_t dBeta)
{
    const int32x4_t qDelta = vdupq_n_s32 (DELTA);
    const int32x4_t qMin = vdupq_n_s32 (0);
    const int32x4_t qMax = vdupq_n_s32 (255);

    int32x4_t qLo = vmulq_lane_s32 (vld1q_s32 (s0), dBeta, 0);
    int32x4_t qHi = vmulq_lane_s32 (vld1q_s32 (s0 + 4), dBeta, 0);
    qLo = vmlaq_lane_s32 (qLo, vld1q_s32 (s1), dBeta, 1);
    qHi = vmlaq_lane_s32 (qHi, vld1q_s32 (s1 + 4), dBeta, 1);

    /* Adding DELTA before the arithmetic shift rounds instead of truncating. */
    qLo = vshrq_n_s32 (vaddq_s32 (qLo, qDelta), BITS);
    qHi = vshrq_n_s32 (vaddq_s32 (qHi, qDelta), BITS);

    /* Clamp first so the plain (non-saturating) narrowing below is lossless. */
    qLo = vminq_s32 (vmaxq_s32 (qLo, qMin), qMax);
    qHi = vminq_s32 (vmaxq_s32 (qHi, qMin), qMax);

    return vmovn_u16 (vreinterpretq_u16_s16 (vcombine_s16 (vmovn_s32 (qLo), vmovn_s32 (qHi))));
}

/**
 * Vertical pass of a fixed-point linear resize: combines two rows of 32-bit
 * intermediate values into one row of 8-bit output.
 *
 *   dst[x] = clamp ((src[0][x] * beta[0] + src[1][x] * beta[1] + DELTA) >> BITS, 0, 255)
 *
 * @param src   two row pointers; src[0] and src[1] each hold `width` ints
 * @param dst   output row, `width` bytes
 * @param beta  two weights, beta[0] for src[0] and beta[1] for src[1]
 * @param width number of elements to produce; nothing is written when <= 0
 *
 * Only dst[0 .. width-1] and src[i][0 .. width-1] are accessed: the final
 * partial group of fewer than 8 elements is staged through local buffers.
 */
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    /* Built with intrinsics; an empty-brace initializer is not valid before C23. */
    const int32x2_t dBeta = vset_lane_s32 ( (int) beta[1], vdup_n_s32 ( (int) beta[0]), 1);

    int x = 0;

    for (; x <= width - 8; x += 8)
    {
        vst1_u8 (&dst[x], ne10_img_vresize_blend8 (&S0[x], &S1[x], dBeta));
    }

    if (x < width)
    {
        const int remaining = width - x;
        int row0[8] = { 0 };
        int row1[8] = { 0 };
        unsigned char out[8];
        int i;

        /* Copy only the valid samples; the zero padding feeds lanes whose
         * results are discarded below. */
        for (i = 0; i < remaining; i++)
        {
            row0[i] = S0[x + i];
            row1[i] = S1[x + i];
        }

        vst1_u8 (out, ne10_img_vresize_blend8 (row0, row1, dBeta));

        for (i = 0; i < remaining; i++)
        {
            dst[x + i] = out[i];
        }
    }
}
#define BITS
void ne10_img_hresize_4channels_linear_neon(const unsigned char **src, int **dst, int count, const int *xofs, const short *alpha, int swidth, int dwidth, int cn, int xmin, int xmax)
#define DELTA
#define INTER_RESIZE_COEF_SCALE
#define D1
const ne10_uint64_t ne10_img_vresize_linear_mask_residual_table[NE10_VRESIZE_LINEAR_MASK_TABLE_SIZE]
void ne10_img_vresize_linear_neon(const int **src, unsigned char *dst, const short *beta, int width)