#define INTER_RESIZE_COEF_BITS  (11)                           /* coefficients are Q11 fixed point */
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)  /* 1.0 in Q11 */
#define BITS  (INTER_RESIZE_COEF_BITS * 2)                     /* total shift after two Q11 multiplies */
#define DELTA (1 << (INTER_RESIZE_COEF_BITS * 2 - 1))          /* rounding bias for the >> BITS step */

/**
 * Horizontal pass of a bilinear resize for 4-channel 8-bit images (NEON).
 *
 * For each destination column dx, blends the two neighboring source pixels
 * xofs[dx] and xofs[dx] + cn with the Q11 weight pair alpha[2*dx],
 * alpha[2*dx+1], writing the un-normalized Q11 sums as 32-bit ints for the
 * vertical pass to consume.
 *
 * @param src    array of `count` source row pointers (8-bit, interleaved)
 * @param dst    array of `count` destination row pointers (32-bit Q11 sums)
 * @param count  number of rows buffered for this pass
 * @param xofs   per-destination-column source byte offset (pre-scaled by cn)
 * @param alpha  Q11 weight pairs, 2 per destination column
 * @param swidth source row width in elements (unused in this NEON path)
 * @param dwidth destination row width in elements
 * @param cn     channel count; this kernel assumes cn == 4 — a single 8-byte
 *               load then covers both 4-channel neighbor pixels
 * @param xmin   first column not needing left-border clamping (unused here;
 *               the vector loop starts at 0 — TODO confirm against caller)
 * @param xmax   last column whose right neighbor is in bounds; beyond it the
 *               left pixel alone is used with full weight
 */
void ne10_img_hresize_4channels_linear_neon (const unsigned char **src, int **dst, int count,
        const int *xofs, const short *alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    /* Full weight (1.0 in Q11) for columns past xmax, where only the
     * left source pixel contributes. */
    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    /* Process rows in overlapping pairs (k, k+1) for instruction-level
     * parallelism.  NOTE(review): with k++ the interior rows are computed
     * twice with identical results — wasteful but harmless; kept to match
     * the original behavior. */
    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        /* Main loop: both neighbor pixels are in bounds. */
        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            /* De-interleave the 4 weight pairs for this column group. */
            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            /* 8 bytes = pixel sx (4 channels) + pixel sx+4 (4 channels). */
            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            /* Widen u8 -> s16 so the Q11 multiply fits in 32 bits. */
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            /* left*a0 + right*a1, per channel, still in Q11. */
            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        /* Tail: right neighbor out of bounds — left pixel at full weight. */
        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    /* Remaining single row (same math as above, one row at a time). */
    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];

        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}
/**
 * Vertical pass of a bilinear resize (NEON).
 *
 * Blends two rows of horizontal-pass Q11 sums with the Q11 weight pair
 * beta[0]/beta[1], adds the rounding bias DELTA, shifts the 2*Q11 product
 * back down by BITS, clamps to [0, 255], and narrows to 8-bit output.
 *
 * @param src   two source row pointers of 32-bit Q11 sums (src[0], src[1])
 * @param dst   destination row of 8-bit pixels
 * @param beta  Q11 vertical weight pair
 * @param width row width in bytes/elements
 */
void ne10_img_vresize_linear_neon (const int **src, unsigned char *dst, const short *beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    /* Broadcast the two row weights into one 2-lane vector so the
     * multiply-by-lane forms can be used below. */
    int32x2_t dBeta = {};
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);   /* rounding bias before >> BITS */
    qMin = vdupq_n_s32 (0);         /* clamp to valid u8 range */
    qMax = vdupq_n_s32 (255);

    int x = 0;

    /* Main loop: 8 output pixels per iteration. */
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        /* row0*beta0 + row1*beta1 */
        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        /* round, rescale out of Q22, clamp */
        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        /* narrow s32 -> s16 -> u8 (values already clamped to [0,255]) */
        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    /* Residual tail (1..7 pixels): compute a full 8-lane result, then use a
     * byte-select mask so only the remaining (width - x) destination bytes
     * are overwritten.  NOTE(review): mask layout comes from
     * ne10_img_vresize_linear_mask_residual_table — indexed by residual
     * count minus one; confirm against the table's definition. */
    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x) - 1]));

        /* Preserve the bytes past the end of the row. */
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        /* Blend: mask-selected new bytes, original bytes elsewhere. */
        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}
void ne10_img_hresize_4channels_linear_neon(const unsigned char **src, int **dst, int count, const int *xofs, const short *alpha, int swidth, int dwidth, int cn, int xmin, int xmax)
#define INTER_RESIZE_COEF_SCALE
const ne10_uint64_t ne10_img_vresize_linear_mask_residual_table[NE10_VRESIZE_LINEAR_MASK_TABLE_SIZE]
void ne10_img_vresize_linear_neon(const int **src, unsigned char *dst, const short *beta, int width)