Project Ne10
An open, optimized software library for the ARM architecture.
NE10_fir.c
Go to the documentation of this file.
1 /*
2  * Copyright 2012-16 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fir.c
30  */
31 
32 #include "NE10_types.h"
33 
112  ne10_float32_t * pSrc,
113  ne10_float32_t * pDst,
114  ne10_uint32_t blockSize)
115 {
116 
117  ne10_float32_t *pState = S->pState; /* State pointer */
118  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
119  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
120  ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
121  ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
122  ne10_uint32_t i, tapCnt, blkCnt; /* Loop counters */
123 
124  /* Run the below code for Cortex-M4 and Cortex-M3 */
125 
126  ne10_float32_t acc0, acc1, acc2, acc3; /* Accumulators */
127  ne10_float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
128 
129 
130  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
131  /* pStateCurnt points to the location where the new input data should be written */
132  pStateCurnt = & (S->pState[ (numTaps - 1u)]);
133 
134  /* Apply loop unrolling and compute 4 output values simultaneously.
135  * The variables acc0 ... acc3 hold output values that are being computed:
136  *
137  * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
138  * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
139  * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
140  * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
141  */
142  blkCnt = blockSize >> 2;
143 
144  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
145  ** a second loop below computes the remaining 1 to 3 samples. */
146  while (blkCnt > 0u)
147  {
148  /* Copy four new input samples into the state buffer */
149  *pStateCurnt++ = *pSrc++;
150  *pStateCurnt++ = *pSrc++;
151  *pStateCurnt++ = *pSrc++;
152  *pStateCurnt++ = *pSrc++;
153 
154  /* Set all accumulators to zero */
155  acc0 = 0.0f;
156  acc1 = 0.0f;
157  acc2 = 0.0f;
158  acc3 = 0.0f;
159 
160  /* Initialize state pointer */
161  px = pState;
162 
163  /* Initialize coeff pointer */
164  pb = (pCoeffs);
165 
166  /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
167  x0 = *px++;
168  x1 = *px++;
169  x2 = *px++;
170 
171  /* Loop unrolling. Process 4 taps at a time. */
172  tapCnt = numTaps >> 2u;
173 
174  /* Loop over the number of taps. Unroll by a factor of 4.
175  ** Repeat until we've computed numTaps-4 coefficients. */
176  while (tapCnt > 0u)
177  {
178  /* Read the b[numTaps-1] coefficient */
179  c0 = * (pb++);
180 
181  /* Read x[n-numTaps-3] sample */
182  x3 = * (px++);
183 
184  /* acc0 += b[numTaps-1] * x[n-numTaps] */
185  acc0 += x0 * c0;
186 
187  /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
188  acc1 += x1 * c0;
189 
190  /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
191  acc2 += x2 * c0;
192 
193  /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
194  acc3 += x3 * c0;
195 
196  /* Read the b[numTaps-2] coefficient */
197  c0 = * (pb++);
198 
199  /* Read x[n-numTaps-4] sample */
200  x0 = * (px++);
201 
202  /* Perform the multiply-accumulate */
203  acc0 += x1 * c0;
204  acc1 += x2 * c0;
205  acc2 += x3 * c0;
206  acc3 += x0 * c0;
207 
208  /* Read the b[numTaps-3] coefficient */
209  c0 = * (pb++);
210 
211  /* Read x[n-numTaps-5] sample */
212  x1 = * (px++);
213 
214  /* Perform the multiply-accumulates */
215  acc0 += x2 * c0;
216  acc1 += x3 * c0;
217  acc2 += x0 * c0;
218  acc3 += x1 * c0;
219 
220  /* Read the b[numTaps-4] coefficient */
221  c0 = * (pb++);
222 
223  /* Read x[n-numTaps-6] sample */
224  x2 = * (px++);
225 
226  /* Perform the multiply-accumulates */
227  acc0 += x3 * c0;
228  acc1 += x0 * c0;
229  acc2 += x1 * c0;
230  acc3 += x2 * c0;
231 
232  tapCnt--;
233  }
234 
235  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
236  tapCnt = numTaps % 0x4u;
237 
238  while (tapCnt > 0u)
239  {
240  /* Read coefficients */
241  c0 = * (pb++);
242 
243  /* Fetch 1 state variable */
244  x3 = * (px++);
245 
246  /* Perform the multiply-accumulates */
247  acc0 += x0 * c0;
248  acc1 += x1 * c0;
249  acc2 += x2 * c0;
250  acc3 += x3 * c0;
251 
252  /* Reuse the present sample states for next sample */
253  x0 = x1;
254  x1 = x2;
255  x2 = x3;
256 
257  /* Decrement the loop counter */
258  tapCnt--;
259  }
260 
261  /* Advance the state pointer by 4 to process the next group of 4 samples */
262  pState = pState + 4;
263 
264  /* The results in the 4 accumulators, store in the destination buffer. */
265  *pDst++ = acc0;
266  *pDst++ = acc1;
267  *pDst++ = acc2;
268  *pDst++ = acc3;
269 
270  blkCnt--;
271  }
272 
273  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
274  ** No loop unrolling is used. */
275  blkCnt = blockSize % 0x4u;
276 
277  while (blkCnt > 0u)
278  {
279  /* Copy one sample at a time into state buffer */
280  *pStateCurnt++ = *pSrc++;
281 
282  /* Set the accumulator to zero */
283  acc0 = 0.0f;
284 
285  /* Initialize state pointer */
286  px = pState;
287 
288  /* Initialize Coefficient pointer */
289  pb = (pCoeffs);
290 
291  i = numTaps;
292 
293  /* Perform the multiply-accumulates */
294  do
295  {
296  acc0 += *px++ * *pb++;
297  i--;
298 
299  }
300  while (i > 0u);
301 
302  /* The result is stored in the destination buffer. */
303  *pDst++ = acc0;
304 
305  /* Advance state pointer by 1 for the next sample */
306  pState = pState + 1;
307 
308  blkCnt--;
309  }
310 
311  /* Processing is complete.
312  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
313  ** This prepares the state buffer for the next function call. */
314 
315  /* Points to the start of the state buffer */
316  pStateCurnt = S->pState;
317 
318  tapCnt = (numTaps - 1u) >> 2u;
319 
320  /* copy data */
321  while (tapCnt > 0u)
322  {
323  *pStateCurnt++ = *pState++;
324  *pStateCurnt++ = *pState++;
325  *pStateCurnt++ = *pState++;
326  *pStateCurnt++ = *pState++;
327 
328  /* Decrement the loop counter */
329  tapCnt--;
330  }
331 
332  /* Calculate remaining number of copies */
333  tapCnt = (numTaps - 1u) % 0x4u;
334 
335  /* Copy the remaining float samples */
336  while (tapCnt > 0u)
337  {
338  *pStateCurnt++ = *pState++;
339 
340  /* Decrement the loop counter */
341  tapCnt--;
342  }
343 
344 }
430  ne10_float32_t * pSrc,
431  ne10_float32_t * pDst,
432  ne10_uint32_t blockSize)
433 {
434  ne10_float32_t *pState = S->pState; /* State pointer */
435  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
436  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
437  ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
438  ne10_float32_t sum0; /* Accumulator */
439  ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
440  ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
441  ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
442 
443 
444  /* Run the below code for Cortex-M4 and Cortex-M3 */
445 
446  /* S->pState buffer contains previous frame (numTaps - 1) samples */
447  /* pStateCurnt points to the location where the new input data should be written */
448  pStateCurnt = S->pState + (numTaps - 1u);
449 
450  /* Total number of output samples to be computed */
451  blkCnt = outBlockSize;
452 
453  while (blkCnt > 0u)
454  {
455  /* Copy decimation factor number of new input samples into the state buffer */
456  i = S->M;
457 
458  do
459  {
460  *pStateCurnt++ = *pSrc++;
461 
462  }
463  while (--i);
464 
465  /* Set accumulator to zero */
466  sum0 = 0.0f;
467 
468  /* Initialize state pointer */
469  px = pState;
470 
471  /* Initialize coeff pointer */
472  pb = pCoeffs;
473 
474  /* Loop unrolling. Process 4 taps at a time. */
475  tapCnt = numTaps >> 2;
476 
477  /* Loop over the number of taps. Unroll by a factor of 4.
478  ** Repeat until we've computed numTaps-4 coefficients. */
479  while (tapCnt > 0u)
480  {
481  /* Read the b[numTaps-1] coefficient */
482  c0 = * (pb++);
483 
484  /* Read x[n-numTaps-1] sample */
485  x0 = * (px++);
486 
487  /* Perform the multiply-accumulate */
488  sum0 += x0 * c0;
489 
490  /* Read the b[numTaps-2] coefficient */
491  c0 = * (pb++);
492 
493  /* Read x[n-numTaps-2] sample */
494  x0 = * (px++);
495 
496  /* Perform the multiply-accumulate */
497  sum0 += x0 * c0;
498 
499  /* Read the b[numTaps-3] coefficient */
500  c0 = * (pb++);
501 
502  /* Read x[n-numTaps-3] sample */
503  x0 = * (px++);
504 
505  /* Perform the multiply-accumulate */
506  sum0 += x0 * c0;
507 
508  /* Read the b[numTaps-4] coefficient */
509  c0 = * (pb++);
510 
511  /* Read x[n-numTaps-4] sample */
512  x0 = * (px++);
513 
514  /* Perform the multiply-accumulate */
515  sum0 += x0 * c0;
516 
517  /* Decrement the loop counter */
518  tapCnt--;
519  }
520 
521  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
522  tapCnt = numTaps % 0x4u;
523 
524  while (tapCnt > 0u)
525  {
526  /* Read coefficients */
527  c0 = * (pb++);
528 
529  /* Fetch 1 state variable */
530  x0 = * (px++);
531 
532  /* Perform the multiply-accumulate */
533  sum0 += x0 * c0;
534 
535  /* Decrement the loop counter */
536  tapCnt--;
537  }
538 
539  /* Advance the state pointer by the decimation factor
540  * to process the next group of decimation factor number samples */
541  pState = pState + S->M;
542 
543  /* The result is in the accumulator, store in the destination buffer. */
544  *pDst++ = sum0;
545 
546  /* Decrement the loop counter */
547  blkCnt--;
548  }
549 
550  /* Processing is complete.
551  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
552  ** This prepares the state buffer for the next function call. */
553 
554  /* Points to the start of the state buffer */
555  pStateCurnt = S->pState;
556 
557  i = (numTaps - 1u) >> 2;
558 
559  /* copy data */
560  while (i > 0u)
561  {
562  *pStateCurnt++ = *pState++;
563  *pStateCurnt++ = *pState++;
564  *pStateCurnt++ = *pState++;
565  *pStateCurnt++ = *pState++;
566 
567  /* Decrement the loop counter */
568  i--;
569  }
570 
571  i = (numTaps - 1u) % 0x04u;
572 
573  /* copy data */
574  while (i > 0u)
575  {
576  *pStateCurnt++ = *pState++;
577 
578  /* Decrement the loop counter */
579  i--;
580  }
581 
582 }
680  ne10_float32_t * pSrc,
681  ne10_float32_t * pDst,
682  ne10_uint32_t blockSize)
683 {
684  ne10_float32_t *pState = S->pState; /* State pointer */
685  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
686  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
687  ne10_float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
688 
689 
690  /* Run the below code for Cortex-M4 and Cortex-M3 */
691 
692  ne10_float32_t sum0; /* Accumulators */
693  ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
694  ne10_uint32_t i, blkCnt, j; /* Loop counters */
695  ne10_uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
696 
697 
698  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
699  /* pStateCurnt points to the location where the new input data should be written */
700  pStateCurnt = S->pState + (phaseLen - 1u);
701 
702  /* Total number of input samples */
703  blkCnt = blockSize;
704 
705  /* Loop over the blockSize. */
706  while (blkCnt > 0u)
707  {
708  /* Copy new input sample into the state buffer */
709  *pStateCurnt++ = *pSrc++;
710 
711  /* Address modifier index of coefficient buffer */
712  j = 1u;
713 
714  /* Loop over the Interpolation factor. */
715  i = S->L;
716  while (i > 0u)
717  {
718  /* Set accumulator to zero */
719  sum0 = 0.0f;
720 
721  /* Initialize state pointer */
722  ptr1 = pState;
723 
724  /* Initialize coefficient pointer */
725  ptr2 = pCoeffs + (S->L - j);
726 
727  /* Loop over the polyPhase length. Unroll by a factor of 4.
728  ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
729  tapCnt = phaseLen >> 2u;
730  while (tapCnt > 0u)
731  {
732 
733  /* Read the coefficient */
734  c0 = * (ptr2);
735 
736  /* Upsampling is done by stuffing L-1 zeros between each sample.
737  * So instead of multiplying zeros with coefficients,
738  * Increment the coefficient pointer by interpolation factor times. */
739  ptr2 += S->L;
740 
741  /* Read the input sample */
742  x0 = * (ptr1++);
743 
744  /* Perform the multiply-accumulate */
745  sum0 += x0 * c0;
746 
747  /* Read the coefficient */
748  c0 = * (ptr2);
749 
750  /* Increment the coefficient pointer by interpolation factor times. */
751  ptr2 += S->L;
752 
753  /* Read the input sample */
754  x0 = * (ptr1++);
755 
756  /* Perform the multiply-accumulate */
757  sum0 += x0 * c0;
758 
759  /* Read the coefficient */
760  c0 = * (ptr2);
761 
762  /* Increment the coefficient pointer by interpolation factor times. */
763  ptr2 += S->L;
764 
765  /* Read the input sample */
766  x0 = * (ptr1++);
767 
768  /* Perform the multiply-accumulate */
769  sum0 += x0 * c0;
770 
771  /* Read the coefficient */
772  c0 = * (ptr2);
773 
774  /* Increment the coefficient pointer by interpolation factor times. */
775  ptr2 += S->L;
776 
777  /* Read the input sample */
778  x0 = * (ptr1++);
779 
780  /* Perform the multiply-accumulate */
781  sum0 += x0 * c0;
782 
783  /* Decrement the loop counter */
784  tapCnt--;
785  }
786 
787  /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
788  tapCnt = phaseLen % 0x4u;
789 
790  while (tapCnt > 0u)
791  {
792  /* Perform the multiply-accumulate */
793  sum0 += * (ptr1++) * (*ptr2);
794 
795  /* Increment the coefficient pointer by interpolation factor times. */
796  ptr2 += S->L;
797 
798  /* Decrement the loop counter */
799  tapCnt--;
800  }
801 
802  /* The result is in the accumulator, store in the destination buffer. */
803  *pDst++ = sum0;
804 
805  /* Increment the address modifier index of coefficient buffer */
806  j++;
807 
808  /* Decrement the loop counter */
809  i--;
810  }
811 
812  /* Advance the state pointer by 1
813  * to process the next group of interpolation factor number samples */
814  pState = pState + 1;
815 
816  /* Decrement the loop counter */
817  blkCnt--;
818  }
819 
820  /* Processing is complete.
821  ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
822  ** This prepares the state buffer for the next function call. */
823 
824  /* Points to the start of the state buffer */
825  pStateCurnt = S->pState;
826 
827  tapCnt = (phaseLen - 1u) >> 2u;
828 
829  /* copy data */
830  while (tapCnt > 0u)
831  {
832  *pStateCurnt++ = *pState++;
833  *pStateCurnt++ = *pState++;
834  *pStateCurnt++ = *pState++;
835  *pStateCurnt++ = *pState++;
836 
837  /* Decrement the loop counter */
838  tapCnt--;
839  }
840 
841  tapCnt = (phaseLen - 1u) % 0x04u;
842 
843  while (tapCnt > 0u)
844  {
845  *pStateCurnt++ = *pState++;
846 
847  /* Decrement the loop counter */
848  tapCnt--;
849  }
850 
851 }
930  ne10_float32_t * pSrc,
931  ne10_float32_t * pDst,
932  ne10_uint32_t blockSize)
933 {
934  ne10_float32_t *pState; /* State pointer */
935  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
936  ne10_float32_t *px; /* temporary state pointer */
937  ne10_float32_t *pk; /* temporary coefficient pointer */
938 
939 
940  /* Run the below code for Cortex-M4 and Cortex-M3 */
941 
942  ne10_float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
943  ne10_float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
944  ne10_float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
945  ne10_float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
946  ne10_uint32_t numStages = S->numStages; /* Number of stages in the filter */
947  ne10_uint32_t blkCnt, stageCnt; /* temporary variables for counts */
948 
949  gcurr1 = 0.0f;
950  pState = &S->pState[0];
951 
952  blkCnt = blockSize >> 2;
953 
954  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
955  a second loop below computes the remaining 1 to 3 samples. */
956  while (blkCnt > 0u)
957  {
958 
959  /* Read two samples from input buffer */
960  /* f0(n) = x(n) */
961  fcurr1 = *pSrc++;
962  fcurr2 = *pSrc++;
963 
964  /* Initialize coeff pointer */
965  pk = (pCoeffs);
966 
967  /* Initialize state pointer */
968  px = pState;
969 
970  /* Read g0(n-1) from state */
971  gcurr1 = *px;
972 
973  /* Process first sample for first tap */
974  /* f1(n) = f0(n) + K1 * g0(n-1) */
975  fnext1 = fcurr1 + ( (*pk) * gcurr1);
976  /* g1(n) = f0(n) * K1 + g0(n-1) */
977  gnext1 = (fcurr1 * (*pk)) + gcurr1;
978 
979  /* Process second sample for first tap */
980  /* for sample 2 processing */
981  fnext2 = fcurr2 + ( (*pk) * fcurr1);
982  gnext2 = (fcurr2 * (*pk)) + fcurr1;
983 
984  /* Read next two samples from input buffer */
985  /* f0(n+2) = x(n+2) */
986  fcurr3 = *pSrc++;
987  fcurr4 = *pSrc++;
988 
989  /* Copy only last input samples into the state buffer
990  which will be used for next four samples processing */
991  *px++ = fcurr4;
992 
993  /* Process third sample for first tap */
994  fnext3 = fcurr3 + ( (*pk) * fcurr2);
995  gnext3 = (fcurr3 * (*pk)) + fcurr2;
996 
997  /* Process fourth sample for first tap */
998  fnext4 = fcurr4 + ( (*pk) * fcurr3);
999  gnext4 = (fcurr4 * (*pk++)) + fcurr3;
1000 
1001  /* Update of f values for next coefficient set processing */
1002  fcurr1 = fnext1;
1003  fcurr2 = fnext2;
1004  fcurr3 = fnext3;
1005  fcurr4 = fnext4;
1006 
1007  /* Loop unrolling. Process 4 taps at a time . */
1008  stageCnt = (numStages - 1u) >> 2u;
1009 
1010  /* Loop over the number of taps. Unroll by a factor of 4.
1011  ** Repeat until we've computed numStages-3 coefficients. */
1012 
1013  /* Process 2nd, 3rd, 4th and 5th taps ... here */
1014  while (stageCnt > 0u)
1015  {
1016  /* Read g1(n-1), g3(n-1) .... from state */
1017  gcurr1 = *px;
1018 
1019  /* save g1(n) in state buffer */
1020  *px++ = gnext4;
1021 
1022  /* Process first sample for 2nd, 6th .. tap */
1023  /* Sample processing for K2, K6.... */
1024  /* f2(n) = f1(n) + K2 * g1(n-1) */
1025  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1026  /* Process second sample for 2nd, 6th .. tap */
1027  /* for sample 2 processing */
1028  fnext2 = fcurr2 + ( (*pk) * gnext1);
1029  /* Process third sample for 2nd, 6th .. tap */
1030  fnext3 = fcurr3 + ( (*pk) * gnext2);
1031  /* Process fourth sample for 2nd, 6th .. tap */
1032  fnext4 = fcurr4 + ( (*pk) * gnext3);
1033 
1034  /* g2(n) = f1(n) * K2 + g1(n-1) */
1035  /* Calculation of state values for next stage */
1036  gnext4 = (fcurr4 * (*pk)) + gnext3;
1037  gnext3 = (fcurr3 * (*pk)) + gnext2;
1038  gnext2 = (fcurr2 * (*pk)) + gnext1;
1039  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1040 
1041 
1042  /* Read g2(n-1), g4(n-1) .... from state */
1043  gcurr1 = *px;
1044 
1045  /* save g2(n) in state buffer */
1046  *px++ = gnext4;
1047 
1048  /* Sample processing for K3, K7.... */
1049  /* Process first sample for 3rd, 7th .. tap */
1050  /* f3(n) = f2(n) + K3 * g2(n-1) */
1051  fcurr1 = fnext1 + ( (*pk) * gcurr1);
1052  /* Process second sample for 3rd, 7th .. tap */
1053  fcurr2 = fnext2 + ( (*pk) * gnext1);
1054  /* Process third sample for 3rd, 7th .. tap */
1055  fcurr3 = fnext3 + ( (*pk) * gnext2);
1056  /* Process fourth sample for 3rd, 7th .. tap */
1057  fcurr4 = fnext4 + ( (*pk) * gnext3);
1058 
1059  /* Calculation of state values for next stage */
1060  /* g3(n) = f2(n) * K3 + g2(n-1) */
1061  gnext4 = (fnext4 * (*pk)) + gnext3;
1062  gnext3 = (fnext3 * (*pk)) + gnext2;
1063  gnext2 = (fnext2 * (*pk)) + gnext1;
1064  gnext1 = (fnext1 * (*pk++)) + gcurr1;
1065 
1066 
1067  /* Read g1(n-1), g3(n-1) .... from state */
1068  gcurr1 = *px;
1069 
1070  /* save g3(n) in state buffer */
1071  *px++ = gnext4;
1072 
1073  /* Sample processing for K4, K8.... */
1074  /* Process first sample for 4th, 8th .. tap */
1075  /* f4(n) = f3(n) + K4 * g3(n-1) */
1076  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1077  /* Process second sample for 4th, 8th .. tap */
1078  /* for sample 2 processing */
1079  fnext2 = fcurr2 + ( (*pk) * gnext1);
1080  /* Process third sample for 4th, 8th .. tap */
1081  fnext3 = fcurr3 + ( (*pk) * gnext2);
1082  /* Process fourth sample for 4th, 8th .. tap */
1083  fnext4 = fcurr4 + ( (*pk) * gnext3);
1084 
1085  /* g4(n) = f3(n) * K4 + g3(n-1) */
1086  /* Calculation of state values for next stage */
1087  gnext4 = (fcurr4 * (*pk)) + gnext3;
1088  gnext3 = (fcurr3 * (*pk)) + gnext2;
1089  gnext2 = (fcurr2 * (*pk)) + gnext1;
1090  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1091 
1092  /* Read g2(n-1), g4(n-1) .... from state */
1093  gcurr1 = *px;
1094 
1095  /* save g4(n) in state buffer */
1096  *px++ = gnext4;
1097 
1098  /* Sample processing for K5, K9.... */
1099  /* Process first sample for 5th, 9th .. tap */
1100  /* f5(n) = f4(n) + K5 * g4(n-1) */
1101  fcurr1 = fnext1 + ( (*pk) * gcurr1);
1102  /* Process second sample for 5th, 9th .. tap */
1103  fcurr2 = fnext2 + ( (*pk) * gnext1);
1104  /* Process third sample for 5th, 9th .. tap */
1105  fcurr3 = fnext3 + ( (*pk) * gnext2);
1106  /* Process fourth sample for 5th, 9th .. tap */
1107  fcurr4 = fnext4 + ( (*pk) * gnext3);
1108 
1109  /* Calculation of state values for next stage */
1110  /* g5(n) = f4(n) * K5 + g4(n-1) */
1111  gnext4 = (fnext4 * (*pk)) + gnext3;
1112  gnext3 = (fnext3 * (*pk)) + gnext2;
1113  gnext2 = (fnext2 * (*pk)) + gnext1;
1114  gnext1 = (fnext1 * (*pk++)) + gcurr1;
1115 
1116  stageCnt--;
1117  }
1118 
1119  /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
1120  stageCnt = (numStages - 1u) % 0x4u;
1121 
1122  while (stageCnt > 0u)
1123  {
1124  gcurr1 = *px;
1125 
1126  /* save g value in state buffer */
1127  *px++ = gnext4;
1128 
1129  /* Process four samples for last three taps here */
1130  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1131  fnext2 = fcurr2 + ( (*pk) * gnext1);
1132  fnext3 = fcurr3 + ( (*pk) * gnext2);
1133  fnext4 = fcurr4 + ( (*pk) * gnext3);
1134 
1135  /* g1(n) = f0(n) * K1 + g0(n-1) */
1136  gnext4 = (fcurr4 * (*pk)) + gnext3;
1137  gnext3 = (fcurr3 * (*pk)) + gnext2;
1138  gnext2 = (fcurr2 * (*pk)) + gnext1;
1139  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1140 
1141  /* Update of f values for next coefficient set processing */
1142  fcurr1 = fnext1;
1143  fcurr2 = fnext2;
1144  fcurr3 = fnext3;
1145  fcurr4 = fnext4;
1146 
1147  stageCnt--;
1148 
1149  }
1150 
1151  /* The results in the 4 accumulators, store in the destination buffer. */
1152  /* y(n) = fN(n) */
1153  *pDst++ = fcurr1;
1154  *pDst++ = fcurr2;
1155  *pDst++ = fcurr3;
1156  *pDst++ = fcurr4;
1157 
1158  blkCnt--;
1159  }
1160 
1161  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
1162  ** No loop unrolling is used. */
1163  blkCnt = blockSize % 0x4u;
1164 
1165  while (blkCnt > 0u)
1166  {
1167  /* f0(n) = x(n) */
1168  fcurr1 = *pSrc++;
1169 
1170  /* Initialize coeff pointer */
1171  pk = (pCoeffs);
1172 
1173  /* Initialize state pointer */
1174  px = pState;
1175 
1176  /* read g2(n) from state buffer */
1177  gcurr1 = *px;
1178 
1179  /* for sample 1 processing */
1180  /* f1(n) = f0(n) + K1 * g0(n-1) */
1181  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1182  /* g1(n) = f0(n) * K1 + g0(n-1) */
1183  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1184 
1185  /* save g1(n) in state buffer */
1186  *px++ = fcurr1;
1187 
1188  /* f1(n) is saved in fcurr1
1189  for next stage processing */
1190  fcurr1 = fnext1;
1191 
1192  stageCnt = (numStages - 1u);
1193 
1194  /* stage loop */
1195  while (stageCnt > 0u)
1196  {
1197  /* read g2(n) from state buffer */
1198  gcurr1 = *px;
1199 
1200  /* save g1(n) in state buffer */
1201  *px++ = gnext1;
1202 
1203  /* Sample processing for K2, K3.... */
1204  /* f2(n) = f1(n) + K2 * g1(n-1) */
1205  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1206  /* g2(n) = f1(n) * K2 + g1(n-1) */
1207  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1208 
1209  /* f1(n) is saved in fcurr1
1210  for next stage processing */
1211  fcurr1 = fnext1;
1212 
1213  stageCnt--;
1214 
1215  }
1216 
1217  /* y(n) = fN(n) */
1218  *pDst++ = fcurr1;
1219 
1220  blkCnt--;
1221 
1222  }
1223 
1224 }
/**
 * Copy blockSize samples from a strided source into a circular buffer.
 *
 * @param circBuffer   base of the circular buffer
 * @param L            length of the circular buffer, in elements
 * @param writeOffset  in/out: index of the first element to write; on return
 *                     it holds the index that follows the last element written
 * @param bufferInc    step added to the write index after every sample
 *                     (may be negative)
 * @param src          first source sample
 * @param srcInc       step added to the source pointer after every sample
 * @param blockSize    number of samples to copy
 *
 * The write index is folded back into [0, L) by adding or subtracting L at
 * most once per step, so the magnitude of bufferInc must not exceed L.
 */
static void ne10_circular_write_float (ne10_int32_t * circBuffer,
                                       ne10_int32_t L,
                                       ne10_uint16_t * writeOffset,
                                       ne10_int32_t bufferInc,
                                       const ne10_int32_t * src,
                                       ne10_int32_t srcInc,
                                       ne10_uint32_t blockSize)
{
    ne10_uint32_t i;

    /* Work on a signed local copy so that a negative step can be detected */
    ne10_int32_t wOffset = *writeOffset;

    for (i = blockSize; i > 0u; i--)
    {
        /* Copy the input sample to the circular buffer */
        circBuffer[wOffset] = *src;

        /* Advance the input pointer */
        src += srcInc;

        /* Circularly update wOffset in both directions: a positive step can
         * run off the end of the buffer, a negative one off its start. */
        wOffset += bufferInc;

        if (wOffset >= L)
        {
            wOffset -= L;
        }
        else if (wOffset < 0)
        {
            wOffset += L;
        }
    }

    /* Publish the updated index; the caller's index is 16 bits wide */
    *writeOffset = (ne10_uint16_t) wOffset;
}
1268 
1269 
1270 
/**
 * Copy blockSize samples out of a circular buffer into a destination that is
 * itself treated as a wrap-around region.
 *
 * @param circBuffer  base of the circular buffer
 * @param L           length of the circular buffer, in elements
 * @param readOffset  in/out: index of the first element to read; on return it
 *                    holds the index that follows the last element read
 * @param bufferInc   step added to the read index after every sample
 *                    (may be negative)
 * @param dst         first destination element to write
 * @param dst_base    start of the destination region
 * @param dst_length  length of the destination region, in elements
 * @param dstInc      step added to the destination pointer after every sample
 * @param blockSize   number of samples to copy
 *
 * The read index is folded back into [0, L) by adding or subtracting L at
 * most once per step, so the magnitude of bufferInc must not exceed L.
 * The destination pointer is reset to dst_base only when it lands exactly on
 * dst_base + dst_length; dstInc must therefore step onto that address.
 */
static void ne10_circular_read_float (ne10_int32_t * circBuffer,
                                      ne10_int32_t L,
                                      ne10_int32_t * readOffset,
                                      ne10_int32_t bufferInc,
                                      ne10_int32_t * dst,
                                      ne10_int32_t * dst_base,
                                      ne10_int32_t dst_length,
                                      ne10_int32_t dstInc,
                                      ne10_uint32_t blockSize)
{
    ne10_uint32_t i;
    ne10_int32_t rOffset = *readOffset;

    /* One past the last element of the destination region */
    ne10_int32_t *dst_end = dst_base + dst_length;

    for (i = blockSize; i > 0u; i--)
    {
        /* Copy the sample from the circular buffer to the destination */
        *dst = circBuffer[rOffset];

        /* Advance the destination pointer, wrapping at the end of its region */
        dst += dstInc;

        if (dst == dst_end)
        {
            dst = dst_base;
        }

        /* Circularly update rOffset in both directions: a positive step can
         * run off the end of the buffer, a negative one off its start. */
        rOffset += bufferInc;

        if (rOffset >= L)
        {
            rOffset -= L;
        }
        else if (rOffset < 0)
        {
            rOffset += L;
        }
    }

    /* Publish the updated index */
    *readOffset = rOffset;
}
1323 
1387  ne10_float32_t * pSrc,
1388  ne10_float32_t * pDst,
1389  ne10_float32_t * pScratchIn,
1390  ne10_uint32_t blockSize)
1391 {
1392 
1393  ne10_float32_t *pState = S->pState; /* State pointer */
1394  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
1395  ne10_float32_t *px; /* Scratch buffer pointer */
1396  ne10_float32_t *py = pState; /* Temporary pointers for state buffer */
1397  ne10_float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
1398  ne10_float32_t *pOut; /* Destination pointer */
1399  ne10_int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
1400  ne10_uint32_t delaySize = S->maxDelay + blockSize; /* state length */
1401  ne10_uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
1402  ne10_int32_t readIndex; /* Read index of the state buffer */
1403  ne10_uint32_t tapCnt, blkCnt; /* loop counters */
1404  ne10_float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
1405 
1406 
1407 
1408  /* BlockSize of Input samples are copied into the state buffer */
1409  /* StateIndex points to the starting position to write in the state buffer */
1410  ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1,
1411  (ne10_int32_t *) pSrc, 1, blockSize);
1412 
1413 
1414  /* Read Index, from where the state buffer should be read, is calculated. */
1415  readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1416 
1417  /* Wraparound of readIndex */
1418  if (readIndex < 0)
1419  {
1420  readIndex += (ne10_int32_t) delaySize;
1421  }
1422 
1423  /* Working pointer for state buffer is updated */
1424  py = pState;
1425 
1426  /* blockSize samples are read from the state buffer */
1427  ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1428  (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1429  blockSize);
1430 
1431  /* Working pointer for the scratch buffer */
1432  px = pb;
1433 
1434  /* Working pointer for destination buffer */
1435  pOut = pDst;
1436 
1437 
1438  /* Run the below code for Cortex-M4 and Cortex-M3 */
1439 
1440  /* Loop over the blockSize. Unroll by a factor of 4.
1441  * Compute 4 Multiplications at a time. */
1442  blkCnt = blockSize >> 2u;
1443 
1444  while (blkCnt > 0u)
1445  {
1446  /* Perform Multiplications and store in destination buffer */
1447  *pOut++ = *px++ * coeff;
1448  *pOut++ = *px++ * coeff;
1449  *pOut++ = *px++ * coeff;
1450  *pOut++ = *px++ * coeff;
1451 
1452  /* Decrement the loop counter */
1453  blkCnt--;
1454  }
1455 
1456  /* If the blockSize is not a multiple of 4,
1457  * compute the remaining samples */
1458  blkCnt = blockSize % 0x4u;
1459 
1460  while (blkCnt > 0u)
1461  {
1462  /* Perform Multiplications and store in destination buffer */
1463  *pOut++ = *px++ * coeff;
1464 
1465  /* Decrement the loop counter */
1466  blkCnt--;
1467  }
1468 
1469  /* Load the coefficient value and
1470  * increment the coefficient buffer for the next set of state values */
1471  coeff = *pCoeffs++;
1472 
1473  /* Read Index, from where the state buffer should be read, is calculated. */
1474  readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1475 
1476  /* Wraparound of readIndex */
1477  if (readIndex < 0)
1478  {
1479  readIndex += (ne10_int32_t) delaySize;
1480  }
1481 
1482  /* Loop over the number of taps. */
1483  tapCnt = (ne10_uint32_t) numTaps - 1u;
1484 
1485  while (tapCnt > 0u)
1486  {
1487 
1488  /* Working pointer for state buffer is updated */
1489  py = pState;
1490 
1491  /* blockSize samples are read from the state buffer */
1492  ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1493  (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1494  blockSize);
1495 
1496  /* Working pointer for the scratch buffer */
1497  px = pb;
1498 
1499  /* Working pointer for destination buffer */
1500  pOut = pDst;
1501 
1502  /* Loop over the blockSize. Unroll by a factor of 4.
1503  * Compute 4 MACS at a time. */
1504  blkCnt = blockSize >> 2u;
1505 
1506  while (blkCnt > 0u)
1507  {
1508  /* Perform Multiply-Accumulate */
1509  *pOut++ += *px++ * coeff;
1510  *pOut++ += *px++ * coeff;
1511  *pOut++ += *px++ * coeff;
1512  *pOut++ += *px++ * coeff;
1513 
1514  /* Decrement the loop counter */
1515  blkCnt--;
1516  }
1517 
1518  /* If the blockSize is not a multiple of 4,
1519  * compute the remaining samples */
1520  blkCnt = blockSize % 0x4u;
1521 
1522  while (blkCnt > 0u)
1523  {
1524  /* Perform Multiply-Accumulate */
1525  *pOut++ += *px++ * coeff;
1526 
1527  /* Decrement the loop counter */
1528  blkCnt--;
1529  }
1530 
1531  /* Load the coefficient value and
1532  * increment the coefficient buffer for the next set of state values */
1533  coeff = *pCoeffs++;
1534 
1535  /* Read Index, from where the state buffer should be read, is calculated. */
1536  readIndex = ( (ne10_int32_t) S->stateIndex -
1537  (ne10_int32_t) blockSize) - *pTapDelay++;
1538 
1539  /* Wraparound of readIndex */
1540  if (readIndex < 0)
1541  {
1542  readIndex += (ne10_int32_t) delaySize;
1543  }
1544 
1545  /* Decrement the tap loop counter */
1546  tapCnt--;
1547  }
1548 
1549 } //end of FIR_sparse group
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:385
Instance structure for the floating-point FIR Sparse filter.
Definition: NE10_types.h:403
int32_t ne10_int32_t
Definition: NE10_types.h:76
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:384
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:375
ne10_uint16_t phaseLength
Length of each polyphase filter component.
Definition: NE10_types.h:395
float ne10_float32_t
Definition: NE10_types.h:80
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:407
void ne10_fir_decimate_float_c(const ne10_fir_decimate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Specific implementation of ne10_fir_decimate_float using plain C.
Definition: NE10_fir.c:429
uint16_t ne10_uint16_t
Definition: NE10_types.h:75
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:363
uint32_t ne10_uint32_t
Definition: NE10_types.h:77
Instance structure for the floating-point FIR Interpolation.
Definition: NE10_types.h:392
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:374
Instance structure for the floating-point FIR filter.
Definition: NE10_types.h:361
ne10_uint8_t L
Interpolation Factor.
Definition: NE10_types.h:394
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:386
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:396
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:365
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:364
Instance structure for the floating point FIR Lattice filter.
Definition: NE10_types.h:371
void ne10_fir_float_c(const ne10_fir_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Specific implementation of ne10_fir_float using plain C.
Definition: NE10_fir.c:111
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:408
Instance structure for the floating-point FIR Decimation.
Definition: NE10_types.h:381
void ne10_fir_lattice_float_c(const ne10_fir_lattice_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Specific implementation of ne10_fir_lattice_float using plain C.
Definition: NE10_fir.c:929
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:405
void ne10_fir_interpolate_float_c(const ne10_fir_interpolate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Specific implementation of ne10_fir_interpolate_float using plain C.
Definition: NE10_fir.c:679
ne10_uint16_t numStages
Number of stages of the lattice filter.
Definition: NE10_types.h:373
ne10_uint16_t maxDelay
The largest number of delay line values.
Definition: NE10_types.h:409
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:397
ne10_int32_t * pTapDelay
Pointer to the array containing positions of the non-zero tap values.
Definition: NE10_types.h:410
ne10_uint8_t M
Decimation Factor.
Definition: NE10_types.h:383
ne10_uint16_t stateIndex
Index pointer for the state buffer.
Definition: NE10_types.h:406
void ne10_fir_sparse_float_c(ne10_fir_sparse_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_float32_t *pScratchIn, ne10_uint32_t blockSize)
Specific implementation of ne10_fir_sparse_float using plain C.
Definition: NE10_fir.c:1386