tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:        arm_fir_q15.c
   9 *
  10 * Description:  Q15 FIR filter processing function.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup FIR
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Processing function for the Q15 FIR filter.
  54  * @param[in] *S points to an instance of the Q15 FIR structure.
  55  * @param[in] *pSrc points to the block of input data.
  56  * @param[out] *pDst points to the block of output data.
  57  * @param[in]  blockSize number of samples to process per call.
  58  * @return none.
  59  *
  60  *
  61  * \par Restrictions
  62  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  63  *  In that case the input, output and state buffers must be 32-bit aligned.
  64  *
  65  * <b>Scaling and Overflow Behavior:</b>
  66  * \par
  67  * The function is implemented using a 64-bit internal accumulator.
  68  * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
  69  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
  70  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
  71  * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
  72  * Lastly, the accumulator is saturated to yield a result in 1.15 format.
  73  *
  74  * \par
  75  * Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.
  76  */
  77
  78 #ifndef ARM_MATH_CM0_FAMILY
  79
  80 /* Run the below code for Cortex-M4 and Cortex-M3 */
  81
  82 #ifndef UNALIGNED_SUPPORT_DISABLE
  83
  84
  85 void arm_fir_q15(
  86   const arm_fir_instance_q15 * S,
  87   q15_t * pSrc,
  88   q15_t * pDst,
  89   uint32_t blockSize)
  90 {
  91   q15_t *pState = S->pState;                     /* State pointer; advanced as outputs are produced */
  92   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  93   q15_t *pStateCurnt;                            /* Write position for new input samples in the state buffer */
  94   q15_t *px1;                                    /* Temporary q15 pointer for state buffer */
  95   q15_t *pb;                                     /* Temporary q15 pointer for coefficient buffer */
  96   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold SIMD state and coefficient values */
  97   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  98   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
  99   uint32_t tapCnt, blkCnt;                       /* Loop counters */
 100
 101   /* Variant compiled when neither ARM_MATH_CM0_FAMILY nor UNALIGNED_SUPPORT_DISABLE is defined. */
 102   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
 103   /* pStateCurnt points to the location where the new input data should be written */
 104   pStateCurnt = &(S->pState[(numTaps - 1u)]);
 105
 106   /* Apply loop unrolling and compute 4 output values simultaneously.
 107    * The variables acc0 ... acc3 hold output values that are being computed:
 108    *
 109    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
 110    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
 111    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 112    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
 113    */
 114
 115   blkCnt = blockSize >> 2;
 116
 117   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 118    ** A second loop below computes the remaining 1 to 3 samples. */
 119   while(blkCnt > 0u)
 120   {
 121     /* Copy four new input samples into the state buffer.
 122      ** Use 32-bit SIMD to move the 16-bit data.  Only requires two copies. */
 123     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
 124     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
 125
 126     /* Set all accumulators to zero */
 127     acc0 = 0;
 128     acc1 = 0;
 129     acc2 = 0;
 130     acc3 = 0;
 131
 132     /* Initialize state pointer of type q15 */
 133     px1 = pState;
 134
 135     /* Initialize coefficient pointer (q15_t *); coefficients are fetched two at a time via __SIMD32 */
 136     pb = pCoeffs;
 137
 138     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
 139     x0 = _SIMD32_OFFSET(px1);
 140
 141     /* Read the second and third samples from the state buffer: x[n-N-1], x[n-N-2] */
 142     x1 = _SIMD32_OFFSET(px1 + 1u);
 143
 144     px1 += 2u;
 145
 146     /* Loop over the number of taps.  Unroll by a factor of 4.
 147      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
 148     tapCnt = numTaps >> 2;
 149
 150     while(tapCnt > 0u)
 151     {
 152       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
 153       c0 = *__SIMD32(pb)++;
 154
 155       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
 156       acc0 = __SMLALD(x0, c0, acc0);
 157
 158       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
 159       acc1 = __SMLALD(x1, c0, acc1);
 160
 161       /* Read state x[n-N-2], x[n-N-3] */
 162       x2 = _SIMD32_OFFSET(px1);
 163
 164       /* Read state x[n-N-3], x[n-N-4] */
 165       x3 = _SIMD32_OFFSET(px1 + 1u);
 166
 167       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
 168       acc2 = __SMLALD(x2, c0, acc2);
 169
 170       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
 171       acc3 = __SMLALD(x3, c0, acc3);
 172
 173       /* Read coefficients b[N-2], b[N-3] */
 174       c0 = *__SIMD32(pb)++;
 175
 176       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
 177       acc0 = __SMLALD(x2, c0, acc0);
 178
 179       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
 180       acc1 = __SMLALD(x3, c0, acc1);
 181
 182       /* Read state x[n-N-4], x[n-N-5]; x0 and x1 are reused by the next iteration or by the 2-tap remainder below */
 183       x0 = _SIMD32_OFFSET(px1 + 2u);
 184
 185       /* Read state x[n-N-5], x[n-N-6] */
 186       x1 = _SIMD32_OFFSET(px1 + 3u);
 187
 188       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
 189       acc2 = __SMLALD(x0, c0, acc2);
 190
 191       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
 192       acc3 = __SMLALD(x1, c0, acc3);
 193
 194       px1 += 4u;                                   /* Four state samples were consumed by this iteration */
 195
 196       tapCnt--;
 197
 198     }
 199
 200
 201     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
 202      ** This is always 2 taps because the filter length is assumed to be even. */
 203     if((numTaps & 0x3u) != 0u)
 204     {
 205       /* Read 2 coefficients */
 206       c0 = *__SIMD32(pb)++;
 207
 208       /* Fetch the next two overlapping state pairs (x0 and x1 are already loaded) */
 209       x2 = _SIMD32_OFFSET(px1);
 210
 211       x3 = _SIMD32_OFFSET(px1 + 1u);
 212
 213       /* Perform the multiply-accumulates */
 214       acc0 = __SMLALD(x0, c0, acc0);
 215
 216       px1 += 2u;
 217
 218       acc1 = __SMLALD(x1, c0, acc1);
 219       acc2 = __SMLALD(x2, c0, acc2);
 220       acc3 = __SMLALD(x3, c0, acc3);
 221     }
 222
 223     /* Shift each 64-bit accumulator right by 15 bits and saturate to 16 bits to obtain 1.15 results.
 224      ** Then store the 4 outputs in the destination buffer as two 32-bit writes. */
 225
 226 #ifndef ARM_MATH_BIG_ENDIAN
 227
 228     *__SIMD32(pDst)++ =
 229       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 230     *__SIMD32(pDst)++ =
 231       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 232
 233 #else
 234
 235     *__SIMD32(pDst)++ =
 236       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 237     *__SIMD32(pDst)++ =
 238       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 239
 240 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
 241
 242
 243
 244     /* Advance the state pointer by 4 to process the next group of 4 samples */
 245     pState = pState + 4;
 246
 247     /* Decrement the loop counter */
 248     blkCnt--;
 249   }
 250
 251   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 252    ** No loop unrolling is used. */
 253   blkCnt = blockSize % 0x4u;
 254   while(blkCnt > 0u)
 255   {
 256     /* Copy one sample into state buffer */
 257     *pStateCurnt++ = *pSrc++;
 258
 259     /* Set the accumulator to zero */
 260     acc0 = 0;
 261
 262     /* Initialize state pointer of type q15 */
 263     px1 = pState;
 264
 265     /* Initialize coefficient pointer (q15_t *) */
 266     pb = pCoeffs;
 267
 268     tapCnt = numTaps >> 1;                         /* Two taps are consumed per iteration */
 269
 270     do
 271     {
 272       /* Read one coefficient pair and one state pair, then accumulate both products */
 273       c0 = *__SIMD32(pb)++;
 274       x0 = *__SIMD32(px1)++;
 275
 276       acc0 = __SMLALD(x0, c0, acc0);
 277       tapCnt--;
 278     }
 279     while(tapCnt > 0u);
 280
 281     /* Shift the 64-bit accumulator right by 15 bits and saturate to 16 bits to obtain a 1.15 result.
 282      ** Then store the output in the destination buffer. */
 283     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
 284
 285     /* Advance state pointer by 1 for the next sample */
 286     pState = pState + 1;
 287
 288     /* Decrement the loop counter */
 289     blkCnt--;
 290   }
 291
 292   /* Processing is complete.
 293    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
 294    ** This prepares the state buffer for the next function call. */
 295
 296   /* Points to the start of the state buffer */
 297   pStateCurnt = S->pState;
 298
 299   /* Number of groups of four samples; each group is copied with two 32-bit writes */
 300   tapCnt = (numTaps - 1u) >> 2;
 301
 302   while(tapCnt > 0u)
 303   {
 304
 305     /* Copy state values to start of state buffer */
 306     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
 307     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
 308
 309     tapCnt--;
 310
 311   }
 312
 313   /* Calculation of count for remaining q15_t data */
 314   tapCnt = (numTaps - 1u) % 0x4u;
 315
 316   /* Copy the remaining samples one at a time */
 317   while(tapCnt > 0u)
 318   {
 319     *pStateCurnt++ = *pState++;
 320
 321     /* Decrement the loop counter */
 322     tapCnt--;
 323   }
 324 }
 325
 326 #else /* UNALIGNED_SUPPORT_DISABLE */
 327
 328 void arm_fir_q15(
 329   const arm_fir_instance_q15 * S,
 330   q15_t * pSrc,
 331   q15_t * pDst,
 332   uint32_t blockSize)
 333 {
 334   q15_t *pState = S->pState;                     /* State pointer; advanced as outputs are produced */
 335   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
 336   q15_t *pStateCurnt;                            /* Write position for new input samples in the state buffer */
 337   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
 338   q15_t *pb;                                     /* Temporary q15 pointer for coefficient buffer */
 339   q15_t *px;                                     /* Temporary q15 pointer for state buffer accesses */
 340   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold SIMD state and coefficient values */
 341   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
 342   uint32_t tapCnt, blkCnt;                       /* Loop counters */
 343
 344   /* Variant compiled when UNALIGNED_SUPPORT_DISABLE is defined (and ARM_MATH_CM0_FAMILY is not). */
 345   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
 346   /* pStateCurnt points to the location where the new input data should be written */
 347   pStateCurnt = &(S->pState[(numTaps - 1u)]);
 348
 349   /* Apply loop unrolling and compute 4 output values simultaneously.
 350    * The variables acc0 ... acc3 hold output values that are being computed:
 351    *
 352    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
 353    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
 354    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 355    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
 356    */
 357
 358   blkCnt = blockSize >> 2;
 359
 360   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 361    ** A second loop below computes the remaining 1 to 3 samples. */
 362   while(blkCnt > 0u)
 363   {
 364     /* Copy four new input samples into the state buffer.
 365      ** The samples are copied one 16-bit value at a time; no 32-bit SIMD copy is used here. */
 366     *pStateCurnt++ = *pSrc++;
 367     *pStateCurnt++ = *pSrc++;
 368     *pStateCurnt++ = *pSrc++;
 369     *pStateCurnt++ = *pSrc++;
 370
 371
 372     /* Set all accumulators to zero */
 373     acc0 = 0;
 374     acc1 = 0;
 375     acc2 = 0;
 376     acc3 = 0;
 377
 378     /* Initialize state pointer (q15_t *); state is read two samples at a time via __SIMD32 */
 379     px = pState;
 380
 381     /* Initialize coefficient pointer (q15_t *); coefficients are read two at a time via __SIMD32 */
 382     pb = pCoeffs;
 383
 384     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
 385     x0 = *__SIMD32(px)++;
 386
 387     /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
 388     x2 = *__SIMD32(px)++;
 389
 390     /* Loop over the number of taps.  Unroll by a factor of 4.
 391      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
 392     tapCnt = numTaps >> 2;
 393
 394     while(tapCnt > 0)
 395     {
 396       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
 397       c0 = *__SIMD32(pb)++;
 398
 399       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
 400       acc0 = __SMLALD(x0, c0, acc0);
 401
 402       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
 403       acc2 = __SMLALD(x2, c0, acc2);
 404
 405       /* pack  x[n-N-1] and x[n-N-2] from x0 and x2 instead of loading them from an odd sample offset */
 406 #ifndef ARM_MATH_BIG_ENDIAN
 407       x1 = __PKHBT(x2, x0, 0);
 408 #else
 409       x1 = __PKHBT(x0, x2, 0);
 410 #endif
 411
 412       /* Read state x[n-N-4], x[n-N-5] */
 413       x0 = _SIMD32_OFFSET(px);
 414
 415       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
 416       acc1 = __SMLALDX(x1, c0, acc1);
 417
 418       /* pack  x[n-N-3] and x[n-N-4] */
 419 #ifndef ARM_MATH_BIG_ENDIAN
 420       x1 = __PKHBT(x0, x2, 0);
 421 #else
 422       x1 = __PKHBT(x2, x0, 0);
 423 #endif
 424
 425       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
 426       acc3 = __SMLALDX(x1, c0, acc3);
 427
 428       /* Read coefficients b[N-2], b[N-3] */
 429       c0 = *__SIMD32(pb)++;
 430
 431       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
 432       acc0 = __SMLALD(x2, c0, acc0);
 433
 434       /* Read state x[n-N-6], x[n-N-7] with offset */
 435       x2 = _SIMD32_OFFSET(px + 2u);
 436
 437       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
 438       acc2 = __SMLALD(x0, c0, acc2);
 439
 440       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
 441       acc1 = __SMLALDX(x1, c0, acc1);
 442
 443       /* pack  x[n-N-5] and x[n-N-6] */
 444 #ifndef ARM_MATH_BIG_ENDIAN
 445       x1 = __PKHBT(x2, x0, 0);
 446 #else
 447       x1 = __PKHBT(x0, x2, 0);
 448 #endif
 449
 450       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
 451       acc3 = __SMLALDX(x1, c0, acc3);
 452
 453       /* Update state pointer for next state reading */
 454       px += 4u;
 455
 456       /* Decrement tap count */
 457       tapCnt--;
 458
 459     }
 460
 461     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
 462      ** This is always 2 taps because the filter length is assumed to be even. */
 463     if((numTaps & 0x3u) != 0u)
 464     {
 465
 466       /* Read last two coefficients */
 467       c0 = *__SIMD32(pb)++;
 468
 469       /* Perform the multiply-accumulates */
 470       acc0 = __SMLALD(x0, c0, acc0);
 471       acc2 = __SMLALD(x2, c0, acc2);
 472
 473       /* pack state variables */
 474 #ifndef ARM_MATH_BIG_ENDIAN
 475       x1 = __PKHBT(x2, x0, 0);
 476 #else
 477       x1 = __PKHBT(x0, x2, 0);
 478 #endif
 479
 480       /* Read last state variables */
 481       x0 = *__SIMD32(px);
 482
 483       /* Perform the multiply-accumulates */
 484       acc1 = __SMLALDX(x1, c0, acc1);
 485
 486       /* pack state variables */
 487 #ifndef ARM_MATH_BIG_ENDIAN
 488       x1 = __PKHBT(x0, x2, 0);
 489 #else
 490       x1 = __PKHBT(x2, x0, 0);
 491 #endif
 492
 493       /* Perform the multiply-accumulates */
 494       acc3 = __SMLALDX(x1, c0, acc3);
 495     }
 496
 497     /* Shift each 64-bit accumulator right by 15 bits and saturate to 16 bits to obtain 1.15 results.
 498      ** Then store the 4 outputs in the destination buffer as two 32-bit writes. */
 499
 500 #ifndef ARM_MATH_BIG_ENDIAN
 501
 502     *__SIMD32(pDst)++ =
 503       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 504
 505     *__SIMD32(pDst)++ =
 506       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 507
 508 #else
 509
 510     *__SIMD32(pDst)++ =
 511       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 512
 513     *__SIMD32(pDst)++ =
 514       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 515
 516 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
 517
 518     /* Advance the state pointer by 4 to process the next group of 4 samples */
 519     pState = pState + 4;
 520
 521     /* Decrement the loop counter */
 522     blkCnt--;
 523   }
 524
 525   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 526    ** No loop unrolling is used. */
 527   blkCnt = blockSize % 0x4u;
 528   while(blkCnt > 0u)
 529   {
 530     /* Copy one sample into state buffer */
 531     *pStateCurnt++ = *pSrc++;
 532
 533     /* Set the accumulator to zero */
 534     acc0 = 0;
 535
 536     /* Initialize the state and coefficient pointers; this tail loop uses scalar multiplies, not SIMD */
 537     px = pState;
 538     pb = pCoeffs;
 539
 540     tapCnt = numTaps >> 1u;                        /* Two taps are consumed per iteration */
 541
 542     do
 543     {
 544       acc0 += (q31_t) * px++ * *pb++;
 545           acc0 += (q31_t) * px++ * *pb++;
 546       tapCnt--;
 547     }
 548     while(tapCnt > 0u);
 549
 550     /* Shift the 64-bit accumulator right by 15 bits and saturate to 16 bits to obtain a 1.15 result.
 551      ** Then store the output in the destination buffer. */
 552     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
 553
 554     /* Advance state pointer by 1 for the next sample */
 555     pState = pState + 1u;
 556
 557     /* Decrement the loop counter */
 558     blkCnt--;
 559   }
 560
 561   /* Processing is complete.
 562    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
 563    ** This prepares the state buffer for the next function call. */
 564
 565   /* Points to the start of the state buffer */
 566   pStateCurnt = S->pState;
 567
 568   /* Number of groups of four samples; each group is copied with four 16-bit writes */
 569   tapCnt = (numTaps - 1u) >> 2;
 570
 571   while(tapCnt > 0u)
 572   {
 573     *pStateCurnt++ = *pState++;
 574     *pStateCurnt++ = *pState++;
 575     *pStateCurnt++ = *pState++;
 576     *pStateCurnt++ = *pState++;
 577
 578     tapCnt--;
 579
 580   }
 581
 582   /* Calculation of count for remaining q15_t data */
 583   tapCnt = (numTaps - 1u) % 0x4u;
 584
 585   /* Copy the remaining samples one at a time */
 586   while(tapCnt > 0u)
 587   {
 588     *pStateCurnt++ = *pState++;
 589
 590     /* Decrement the loop counter */
 591     tapCnt--;
 592   }
 593 }
 594
 595
 596 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 597
 598 #else /* ARM_MATH_CM0_FAMILY */
 599
 600
 601 /* Run the below code for Cortex-M0 */
 602
 603 void arm_fir_q15(
 604   const arm_fir_instance_q15 * S,
 605   q15_t * pSrc,
 606   q15_t * pDst,
 607   uint32_t blockSize)
 608 {
 609   const uint32_t numTaps = S->numTaps;           /* Number of taps in the filter */
 610   q15_t *coeffBase = S->pCoeffs;                 /* First coefficient */
 611   q15_t *window = S->pState;                     /* First state sample used for the current output */
 612   q15_t *insertAt = S->pState + (numTaps - 1u);  /* Slot that receives the next input sample */
 613   q15_t *keepDst;                                /* Write cursor for the final state copy */
 614   q15_t *sample;                                 /* Read cursor over the state window */
 615   q15_t *coeff;                                  /* Read cursor over the coefficients */
 616   q63_t sum;                                     /* 64-bit accumulator */
 617   uint32_t samplesLeft, k;                       /* Loop counters */
 618
 619   /* The state buffer holds the previous (numTaps - 1) samples, so every new
 620    * input sample completes one numTaps-long window and yields one output. */
 621   for(samplesLeft = blockSize; samplesLeft != 0u; samplesLeft--)
 622   {
 623     /* Append the new input sample to the state buffer */
 624     *insertAt++ = *pSrc++;
 625
 626     /* Restart the accumulator and both read cursors for this output */
 627     sum = 0;
 628     sample = window;
 629     coeff = coeffBase;
 630     k = numTaps;
 631
 632     /* Multiply-accumulate state[i] * coeff[i] over the whole window */
 633     do
 634     {
 635       sum += (q31_t) (*sample++) * (*coeff++);
 636       k--;
 637     } while(k != 0u);
 638
 639     /* Shift right by 15 bits, saturate to 16 bits and store the 1.15 output */
 640     *pDst++ = (q15_t) __SSAT((sum >> 15u), 16);
 641
 642     /* Slide the window by one sample for the next output */
 643     window++;
 644   }
 645
 646   /* Processing is complete: copy the last (numTaps - 1) samples to the start
 647    * of the state buffer to prepare it for the next function call. */
 648   keepDst = S->pState;
 649
 650   for(k = numTaps - 1u; k != 0u; k--)
 651   {
 652     *keepDst++ = *window++;
 653   }
 654 }
 683
 684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 685
 686
 687
 688
 689 /**
 690  * @} end of FIR group
 691  */