tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_sparse_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_sparse_q15.c
   9 *
  10 * Description:  Q15 sparse FIR filter processing function.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * ------------------------------------------------------------------- */
  40 #include "arm_math.h"
  41
  42 /**
  43  * @addtogroup FIR_Sparse
  44  * @{
  45  */
  46
  47 /**
  48  * @brief Processing function for the Q15 sparse FIR filter.
  49  * @param[in]  *S           points to an instance of the Q15 sparse FIR structure.
  50  * @param[in]  *pSrc        points to the block of input data.
  51  * @param[out] *pDst        points to the block of output data
  52  * @param[in]  *pScratchIn  points to a temporary buffer of size blockSize.
  53  * @param[in]  *pScratchOut points to a temporary buffer of size blockSize.
  54  * @param[in]  blockSize    number of input samples to process per call.
  55  * @return none.
  56  *
  57  * <b>Scaling and Overflow Behavior:</b>
  58  * \par
  59  * The function is implemented using an internal 32-bit accumulator.
  60  * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
  61  * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
  62  * If the accumulator result overflows it will wrap around rather than saturate.
  63  * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
  64  * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
  65  */
  66
  67
  68 void arm_fir_sparse_q15(
  69   arm_fir_sparse_instance_q15 * S,
  70   q15_t * pSrc,
  71   q15_t * pDst,
  72   q15_t * pScratchIn,
  73   q31_t * pScratchOut,
  74   uint32_t blockSize)
  75 {
  76
  77   q15_t *pState = S->pState;                     /* State pointer */
  78   q15_t *pIn = pSrc;                             /* Working pointer for input */
  79   q15_t *pOut = pDst;                            /* Working pointer for output */
  80   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  81   q15_t *px;                                     /* Temporary pointers for scratch buffer */
  82   q15_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
  83   q15_t *py = pState;                            /* Temporary pointers for state buffer */
  84   int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
  85   uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
  86   uint16_t numTaps = S->numTaps;                 /* Filter order */
  87   int32_t readIndex;                             /* Read index of the state buffer */
  88   uint32_t tapCnt, blkCnt;                       /* loop counters */
  89   q15_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
  90   q31_t *pScr2 = pScratchOut;                    /* Working pointer for pScratchOut */
  91
  92
  93 #ifndef ARM_MATH_CM0_FAMILY
  94
  95   /* Run the below code for Cortex-M4 and Cortex-M3 */
  96
  97   q31_t in1, in2;                                /* Temporary variables */
  98
  99
 100   /* BlockSize of Input samples are copied into the state buffer */
 101   /* StateIndex points to the starting position to write in the state buffer */
 102   arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
 103
 104   /* Loop over the number of taps. */
 105   tapCnt = numTaps;
 106
 107   /* Read Index, from where the state buffer should be read, is calculated. */
 108   readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 109
 110   /* Wraparound of readIndex */
 111   if(readIndex < 0)
 112   {
 113     readIndex += (int32_t) delaySize;
 114   }
 115
 116   /* Working pointer for state buffer is updated */
 117   py = pState;
 118
 119   /* blockSize samples are read from the state buffer */
 120   arm_circularRead_q15(py, delaySize, &readIndex, 1,
 121                        pb, pb, blockSize, 1, blockSize);
 122
 123   /* Working pointer for the scratch buffer of state values */
 124   px = pb;
 125
 126   /* Working pointer for scratch buffer of output values */
 127   pScratchOut = pScr2;
 128
 129   /* Loop over the blockSize. Unroll by a factor of 4.
 130    * Compute 4 multiplications at a time. */
 131   blkCnt = blockSize >> 2;
 132
 133   while(blkCnt > 0u)
 134   {
 135     /* Perform multiplication and store in the scratch buffer */
 136     *pScratchOut++ = ((q31_t) * px++ * coeff);
 137     *pScratchOut++ = ((q31_t) * px++ * coeff);
 138     *pScratchOut++ = ((q31_t) * px++ * coeff);
 139     *pScratchOut++ = ((q31_t) * px++ * coeff);
 140
 141     /* Decrement the loop counter */
 142     blkCnt--;
 143   }
 144
 145   /* If the blockSize is not a multiple of 4,
 146    * compute the remaining samples */
 147   blkCnt = blockSize % 0x4u;
 148
 149   while(blkCnt > 0u)
 150   {
 151     /* Perform multiplication and store in the scratch buffer */
 152     *pScratchOut++ = ((q31_t) * px++ * coeff);
 153
 154     /* Decrement the loop counter */
 155     blkCnt--;
 156   }
 157
 158   /* Load the coefficient value and
 159    * increment the coefficient buffer for the next set of state values */
 160   coeff = *pCoeffs++;
 161
 162   /* Read Index, from where the state buffer should be read, is calculated. */
 163   readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 164
 165   /* Wraparound of readIndex */
 166   if(readIndex < 0)
 167   {
 168     readIndex += (int32_t) delaySize;
 169   }
 170
 171   /* Loop over the number of taps. */
 172   tapCnt = (uint32_t) numTaps - 1u;
 173
 174   while(tapCnt > 0u)
 175   {
 176     /* Working pointer for state buffer is updated */
 177     py = pState;
 178
 179     /* blockSize samples are read from the state buffer */
 180     arm_circularRead_q15(py, delaySize, &readIndex, 1,
 181                          pb, pb, blockSize, 1, blockSize);
 182
 183     /* Working pointer for the scratch buffer of state values */
 184     px = pb;
 185
 186     /* Working pointer for scratch buffer of output values */
 187     pScratchOut = pScr2;
 188
 189     /* Loop over the blockSize. Unroll by a factor of 4.
 190      * Compute 4 MACS at a time. */
 191     blkCnt = blockSize >> 2;
 192
 193     while(blkCnt > 0u)
 194     {
 195       /* Perform Multiply-Accumulate */
 196       *pScratchOut++ += (q31_t) * px++ * coeff;
 197       *pScratchOut++ += (q31_t) * px++ * coeff;
 198       *pScratchOut++ += (q31_t) * px++ * coeff;
 199       *pScratchOut++ += (q31_t) * px++ * coeff;
 200
 201       /* Decrement the loop counter */
 202       blkCnt--;
 203     }
 204
 205     /* If the blockSize is not a multiple of 4,
 206      * compute the remaining samples */
 207     blkCnt = blockSize % 0x4u;
 208
 209     while(blkCnt > 0u)
 210     {
 211       /* Perform Multiply-Accumulate */
 212       *pScratchOut++ += (q31_t) * px++ * coeff;
 213
 214       /* Decrement the loop counter */
 215       blkCnt--;
 216     }
 217
 218     /* Load the coefficient value and
 219      * increment the coefficient buffer for the next set of state values */
 220     coeff = *pCoeffs++;
 221
 222     /* Read Index, from where the state buffer should be read, is calculated. */
 223     readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 224
 225     /* Wraparound of readIndex */
 226     if(readIndex < 0)
 227     {
 228       readIndex += (int32_t) delaySize;
 229     }
 230
 231     /* Decrement the tap loop counter */
 232     tapCnt--;
 233   }
 234
 235   /* All the output values are in pScratchOut buffer.
 236      Convert them into 1.15 format, saturate and store in the destination buffer. */
 237   /* Loop over the blockSize. */
 238   blkCnt = blockSize >> 2;
 239
 240   while(blkCnt > 0u)
 241   {
 242     in1 = *pScr2++;
 243     in2 = *pScr2++;
 244
 245 #ifndef  ARM_MATH_BIG_ENDIAN
 246
 247     *__SIMD32(pOut)++ =
 248       __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
 249               16);
 250
 251 #else
 252     *__SIMD32(pOut)++ =
 253       __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
 254               16);
 255
 256 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 257
 258     in1 = *pScr2++;
 259
 260     in2 = *pScr2++;
 261
 262 #ifndef  ARM_MATH_BIG_ENDIAN
 263
 264     *__SIMD32(pOut)++ =
 265       __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
 266               16);
 267
 268 #else
 269
 270     *__SIMD32(pOut)++ =
 271       __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
 272               16);
 273
 274 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 275
 276
 277     blkCnt--;
 278
 279   }
 280
 281   /* If the blockSize is not a multiple of 4,
 282      remaining samples are processed in the below loop */
 283   blkCnt = blockSize % 0x4u;
 284
 285   while(blkCnt > 0u)
 286   {
 287     *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
 288     blkCnt--;
 289   }
 290
 291 #else
 292
 293   /* Run the below code for Cortex-M0 */
 294
 295   /* BlockSize of Input samples are copied into the state buffer */
 296   /* StateIndex points to the starting position to write in the state buffer */
 297   arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
 298
 299   /* Loop over the number of taps. */
 300   tapCnt = numTaps;
 301
 302   /* Read Index, from where the state buffer should be read, is calculated. */
 303   readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 304
 305   /* Wraparound of readIndex */
 306   if(readIndex < 0)
 307   {
 308     readIndex += (int32_t) delaySize;
 309   }
 310
 311   /* Working pointer for state buffer is updated */
 312   py = pState;
 313
 314   /* blockSize samples are read from the state buffer */
 315   arm_circularRead_q15(py, delaySize, &readIndex, 1,
 316                        pb, pb, blockSize, 1, blockSize);
 317
 318   /* Working pointer for the scratch buffer of state values */
 319   px = pb;
 320
 321   /* Working pointer for scratch buffer of output values */
 322   pScratchOut = pScr2;
 323
 324   blkCnt = blockSize;
 325
 326   while(blkCnt > 0u)
 327   {
 328     /* Perform multiplication and store in the scratch buffer */
 329     *pScratchOut++ = ((q31_t) * px++ * coeff);
 330
 331     /* Decrement the loop counter */
 332     blkCnt--;
 333   }
 334
 335   /* Load the coefficient value and
 336    * increment the coefficient buffer for the next set of state values */
 337   coeff = *pCoeffs++;
 338
 339   /* Read Index, from where the state buffer should be read, is calculated. */
 340   readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 341
 342   /* Wraparound of readIndex */
 343   if(readIndex < 0)
 344   {
 345     readIndex += (int32_t) delaySize;
 346   }
 347
 348   /* Loop over the number of taps. */
 349   tapCnt = (uint32_t) numTaps - 1u;
 350
 351   while(tapCnt > 0u)
 352   {
 353     /* Working pointer for state buffer is updated */
 354     py = pState;
 355
 356     /* blockSize samples are read from the state buffer */
 357     arm_circularRead_q15(py, delaySize, &readIndex, 1,
 358                          pb, pb, blockSize, 1, blockSize);
 359
 360     /* Working pointer for the scratch buffer of state values */
 361     px = pb;
 362
 363     /* Working pointer for scratch buffer of output values */
 364     pScratchOut = pScr2;
 365
 366     blkCnt = blockSize;
 367
 368     while(blkCnt > 0u)
 369     {
 370       /* Perform Multiply-Accumulate */
 371       *pScratchOut++ += (q31_t) * px++ * coeff;
 372
 373       /* Decrement the loop counter */
 374       blkCnt--;
 375     }
 376
 377     /* Load the coefficient value and
 378      * increment the coefficient buffer for the next set of state values */
 379     coeff = *pCoeffs++;
 380
 381     /* Read Index, from where the state buffer should be read, is calculated. */
 382     readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
 383
 384     /* Wraparound of readIndex */
 385     if(readIndex < 0)
 386     {
 387       readIndex += (int32_t) delaySize;
 388     }
 389
 390     /* Decrement the tap loop counter */
 391     tapCnt--;
 392   }
 393
 394   /* All the output values are in pScratchOut buffer.
 395      Convert them into 1.15 format, saturate and store in the destination buffer. */
 396   /* Loop over the blockSize. */
 397   blkCnt = blockSize;
 398
 399   while(blkCnt > 0u)
 400   {
 401     *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
 402     blkCnt--;
 403   }
 404
 405 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
 406
 407 }
 408
 409 /**
 410  * @} end of FIR_Sparse group
 411  */