tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_q15.c
   9 *
  10 * Description:  Partial convolution of Q15 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q15 sequences.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  62  *
  63  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  64  *
  65  * \par
  66  * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
  67  *
  68  */
  69
  70
  71 arm_status arm_conv_partial_q15(
  72   q15_t * pSrcA,
  73   uint32_t srcALen,
  74   q15_t * pSrcB,
  75   uint32_t srcBLen,
  76   q15_t * pDst,
  77   uint32_t firstIndex,
  78   uint32_t numPoints)
  79 {
  80
  81 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
  82
  83   /* Run the below code for Cortex-M4 and Cortex-M3 */
  84
  85   q15_t *pIn1;                                   /* inputA pointer               */
  86   q15_t *pIn2;                                   /* inputB pointer               */
  87   q15_t *pOut = pDst;                            /* output pointer               */
  88   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
  89   q15_t *px;                                     /* Intermediate inputA pointer  */
  90   q15_t *py;                                     /* Intermediate inputB pointer  */
  91   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  92   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
  93   uint32_t j, k, count, check, blkCnt;
  94   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
  95   arm_status status;                             /* status of Partial convolution */
  96
  97   /* Check for range of output samples to be calculated */
  98   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  99   {
 100     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 101     status = ARM_MATH_ARGUMENT_ERROR;
 102   }
 103   else
 104   {
 105
 106     /* The algorithm implementation is based on the lengths of the inputs. */
 107     /* srcB is always made to slide across srcA. */
 108     /* So srcBLen is always considered as shorter or equal to srcALen */
 109     if(srcALen >= srcBLen)
 110     {
 111       /* Initialization of inputA pointer */
 112       pIn1 = pSrcA;
 113
 114       /* Initialization of inputB pointer */
 115       pIn2 = pSrcB;
 116     }
 117     else
 118     {
 119       /* Initialization of inputA pointer */
 120       pIn1 = pSrcB;
 121
 122       /* Initialization of inputB pointer */
 123       pIn2 = pSrcA;
 124
 125       /* srcBLen is always considered as shorter or equal to srcALen */
 126       j = srcBLen;
 127       srcBLen = srcALen;
 128       srcALen = j;
 129     }
 130
 131     /* Conditions to check which loopCounter holds
 132      * the first and last indices of the output samples to be calculated. */
 133     check = firstIndex + numPoints;
 134     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 135     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 136     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 137     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 138                                      (int32_t) numPoints) : 0;
 139     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 140                                     (int32_t) firstIndex);
 141     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 142
 143     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 144     /* The function is internally
 145      * divided into three stages according to the number of multiplications that has to be
 146      * taken place between inputA samples and inputB samples. In the first stage of the
 147      * algorithm, the multiplications increase by one for every iteration.
 148      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 149      * In the third stage of the algorithm, the multiplications decrease by one
 150      * for every iteration. */
 151
 152     /* Set the output pointer to point to the firstIndex
 153      * of the output sample to be calculated. */
 154     pOut = pDst + firstIndex;
 155
 156     /* --------------------------
 157      * Initializations of stage1
 158      * -------------------------*/
 159
 160     /* sum = x[0] * y[0]
 161      * sum = x[0] * y[1] + x[1] * y[0]
 162      * ....
 163      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 164      */
 165
 166     /* In this stage the MAC operations are increased by 1 for every iteration.
 167        The count variable holds the number of MAC operations performed.
 168        Since the partial convolution starts from firstIndex
 169        Number of Macs to be performed is firstIndex + 1 */
 170     count = 1u + firstIndex;
 171
 172     /* Working pointer of inputA */
 173     px = pIn1;
 174
 175     /* Working pointer of inputB */
 176     pSrc2 = pIn2 + firstIndex;
 177     py = pSrc2;
 178
 179     /* ------------------------
 180      * Stage1 process
 181      * ----------------------*/
 182
 183     /* For loop unrolling by 4, this stage is divided into two. */
 184     /* First part of this stage computes the MAC operations less than 4 */
 185     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 186
 187     /* The first part of the stage starts here */
 188     while((count < 4u) && (blockSize1 > 0))
 189     {
 190       /* Accumulator is made zero for every iteration */
 191       sum = 0;
 192
 193       /* Loop over number of MAC operations between
 194        * inputA samples and inputB samples */
 195       k = count;
 196
 197       while(k > 0u)
 198       {
 199         /* Perform the multiply-accumulates */
 200         sum = __SMLALD(*px++, *py--, sum);
 201
 202         /* Decrement the loop counter */
 203         k--;
 204       }
 205
 206       /* Store the result in the accumulator in the destination buffer. */
 207       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 208
 209       /* Update the inputA and inputB pointers for next MAC calculation */
 210       py = ++pSrc2;
 211       px = pIn1;
 212
 213       /* Increment the MAC count */
 214       count++;
 215
 216       /* Decrement the loop counter */
 217       blockSize1--;
 218     }
 219
 220     /* The second part of the stage starts here */
 221     /* The internal loop, over count, is unrolled by 4 */
 222     /* To, read the last two inputB samples using SIMD:
 223      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 224     py = py - 1;
 225
 226     while(blockSize1 > 0)
 227     {
 228       /* Accumulator is made zero for every iteration */
 229       sum = 0;
 230
 231       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 232       k = count >> 2u;
 233
 234       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 235        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 236       while(k > 0u)
 237       {
 238         /* Perform the multiply-accumulates */
 239         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 240         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 241         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 242         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 243
 244         /* Decrement the loop counter */
 245         k--;
 246       }
 247
 248       /* For the next MAC operations, the pointer py is used without SIMD
 249        * So, py is incremented by 1 */
 250       py = py + 1u;
 251
 252       /* If the count is not a multiple of 4, compute any remaining MACs here.
 253        ** No loop unrolling is used. */
 254       k = count % 0x4u;
 255
 256       while(k > 0u)
 257       {
 258         /* Perform the multiply-accumulates */
 259         sum = __SMLALD(*px++, *py--, sum);
 260
 261         /* Decrement the loop counter */
 262         k--;
 263       }
 264
 265       /* Store the result in the accumulator in the destination buffer. */
 266       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 267
 268       /* Update the inputA and inputB pointers for next MAC calculation */
 269       py = ++pSrc2 - 1u;
 270       px = pIn1;
 271
 272       /* Increment the MAC count */
 273       count++;
 274
 275       /* Decrement the loop counter */
 276       blockSize1--;
 277     }
 278
 279     /* --------------------------
 280      * Initializations of stage2
 281      * ------------------------*/
 282
 283     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 284      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 285      * ....
 286      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 287      */
 288
 289     /* Working pointer of inputA */
 290     px = pIn1;
 291
 292     /* Working pointer of inputB */
 293     pSrc2 = pIn2 + (srcBLen - 1u);
 294     py = pSrc2;
 295
 296   /* count is the index by which the pointer pIn1 to be incremented */
 297   count = 0u;
 298
 299
 300   /* --------------------
 301    * Stage2 process
 302    * -------------------*/
 303
 304   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 305    * So, to loop unroll over blockSize2,
 306    * srcBLen should be greater than or equal to 4 */
 307   if(srcBLen >= 4u)
 308   {
 309     /* Loop unroll over blockSize2, by 4 */
 310     blkCnt = blockSize2 >> 2u;
 311
 312     while(blkCnt > 0u)
 313     {
 314       py = py - 1u;
 315
 316       /* Set all accumulators to zero */
 317       acc0 = 0;
 318       acc1 = 0;
 319       acc2 = 0;
 320       acc3 = 0;
 321
 322
 323       /* read x[0], x[1] samples */
 324       x0 = *__SIMD32(px);
 325       /* read x[1], x[2] samples */
 326       x1 = _SIMD32_OFFSET(px+1);
 327           px+= 2u;
 328
 329
 330       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 331       k = srcBLen >> 2u;
 332
 333       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 334        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 335       do
 336       {
 337         /* Read the last two inputB samples using SIMD:
 338          * y[srcBLen - 1] and y[srcBLen - 2] */
 339         c0 = *__SIMD32(py)--;
 340
 341         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 342         acc0 = __SMLALDX(x0, c0, acc0);
 343
 344         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 345         acc1 = __SMLALDX(x1, c0, acc1);
 346
 347         /* Read x[2], x[3] */
 348         x2 = *__SIMD32(px);
 349
 350         /* Read x[3], x[4] */
 351         x3 = _SIMD32_OFFSET(px+1);
 352
 353         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 354         acc2 = __SMLALDX(x2, c0, acc2);
 355
 356         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 357         acc3 = __SMLALDX(x3, c0, acc3);
 358
 359         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 360         c0 = *__SIMD32(py)--;
 361
 362         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 363         acc0 = __SMLALDX(x2, c0, acc0);
 364
 365         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 366         acc1 = __SMLALDX(x3, c0, acc1);
 367
 368         /* Read x[4], x[5] */
 369         x0 = _SIMD32_OFFSET(px+2);
 370
 371         /* Read x[5], x[6] */
 372         x1 = _SIMD32_OFFSET(px+3);
 373                 px += 4u;
 374
 375         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 376         acc2 = __SMLALDX(x0, c0, acc2);
 377
 378         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 379         acc3 = __SMLALDX(x1, c0, acc3);
 380
 381       } while(--k);
 382
 383       /* For the next MAC operations, SIMD is not used
 384        * So, the 16 bit pointer if inputB, py is updated */
 385
 386       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 387        ** No loop unrolling is used. */
 388       k = srcBLen % 0x4u;
 389
 390       if(k == 1u)
 391       {
 392         /* Read y[srcBLen - 5] */
 393         c0 = *(py+1);
 394
 395 #ifdef  ARM_MATH_BIG_ENDIAN
 396
 397         c0 = c0 << 16u;
 398
 399 #else
 400
 401         c0 = c0 & 0x0000FFFF;
 402
 403 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 404
 405         /* Read x[7] */
 406         x3 = *__SIMD32(px);
 407                 px++;
 408
 409         /* Perform the multiply-accumulates */
 410         acc0 = __SMLALD(x0, c0, acc0);
 411         acc1 = __SMLALD(x1, c0, acc1);
 412         acc2 = __SMLALDX(x1, c0, acc2);
 413         acc3 = __SMLALDX(x3, c0, acc3);
 414       }
 415
 416       if(k == 2u)
 417       {
 418         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 419         c0 = _SIMD32_OFFSET(py);
 420
 421         /* Read x[7], x[8] */
 422         x3 = *__SIMD32(px);
 423
 424         /* Read x[9] */
 425         x2 = _SIMD32_OFFSET(px+1);
 426                 px += 2u;
 427
 428         /* Perform the multiply-accumulates */
 429         acc0 = __SMLALDX(x0, c0, acc0);
 430         acc1 = __SMLALDX(x1, c0, acc1);
 431         acc2 = __SMLALDX(x3, c0, acc2);
 432         acc3 = __SMLALDX(x2, c0, acc3);
 433       }
 434
 435       if(k == 3u)
 436       {
 437         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 438         c0 = _SIMD32_OFFSET(py);
 439
 440         /* Read x[7], x[8] */
 441         x3 = *__SIMD32(px);
 442
 443         /* Read x[9] */
 444         x2 = _SIMD32_OFFSET(px+1);
 445
 446         /* Perform the multiply-accumulates */
 447         acc0 = __SMLALDX(x0, c0, acc0);
 448         acc1 = __SMLALDX(x1, c0, acc1);
 449         acc2 = __SMLALDX(x3, c0, acc2);
 450         acc3 = __SMLALDX(x2, c0, acc3);
 451
 452                 c0 = *(py-1);
 453
 454 #ifdef  ARM_MATH_BIG_ENDIAN
 455
 456         c0 = c0 << 16u;
 457 #else
 458
 459         c0 = c0 & 0x0000FFFF;
 460 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 461
 462         /* Read x[10] */
 463         x3 =  _SIMD32_OFFSET(px+2);
 464                 px += 3u;
 465
 466         /* Perform the multiply-accumulates */
 467         acc0 = __SMLALDX(x1, c0, acc0);
 468         acc1 = __SMLALD(x2, c0, acc1);
 469         acc2 = __SMLALDX(x2, c0, acc2);
 470         acc3 = __SMLALDX(x3, c0, acc3);
 471       }
 472
 473
 474       /* Store the results in the accumulators in the destination buffer. */
 475
 476 #ifndef  ARM_MATH_BIG_ENDIAN
 477
 478       *__SIMD32(pOut)++ =
 479         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 480       *__SIMD32(pOut)++ =
 481         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 482
 483 #else
 484
 485       *__SIMD32(pOut)++ =
 486         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 487       *__SIMD32(pOut)++ =
 488         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 489
 490 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 491
 492       /* Increment the pointer pIn1 index, count by 4 */
 493       count += 4u;
 494
 495       /* Update the inputA and inputB pointers for next MAC calculation */
 496       px = pIn1 + count;
 497       py = pSrc2;
 498
 499         /* Decrement the loop counter */
 500         blkCnt--;
 501       }
 502
 503       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 504        ** No loop unrolling is used. */
 505       blkCnt = (uint32_t) blockSize2 % 0x4u;
 506
 507       while(blkCnt > 0u)
 508       {
 509         /* Accumulator is made zero for every iteration */
 510         sum = 0;
 511
 512         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 513         k = srcBLen >> 2u;
 514
 515         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 516          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 517         while(k > 0u)
 518         {
 519           /* Perform the multiply-accumulates */
 520           sum += (q63_t) ((q31_t) * px++ * *py--);
 521           sum += (q63_t) ((q31_t) * px++ * *py--);
 522           sum += (q63_t) ((q31_t) * px++ * *py--);
 523           sum += (q63_t) ((q31_t) * px++ * *py--);
 524
 525           /* Decrement the loop counter */
 526           k--;
 527         }
 528
 529         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 530          ** No loop unrolling is used. */
 531         k = srcBLen % 0x4u;
 532
 533         while(k > 0u)
 534         {
 535           /* Perform the multiply-accumulates */
 536           sum += (q63_t) ((q31_t) * px++ * *py--);
 537
 538           /* Decrement the loop counter */
 539           k--;
 540         }
 541
 542         /* Store the result in the accumulator in the destination buffer. */
 543         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 544
 545         /* Increment the pointer pIn1 index, count by 1 */
 546         count++;
 547
 548         /* Update the inputA and inputB pointers for next MAC calculation */
 549         px = pIn1 + count;
 550         py = pSrc2;
 551
 552         /* Decrement the loop counter */
 553         blkCnt--;
 554       }
 555     }
 556     else
 557     {
 558       /* If the srcBLen is not a multiple of 4,
 559        * the blockSize2 loop cannot be unrolled by 4 */
 560       blkCnt = (uint32_t) blockSize2;
 561
 562       while(blkCnt > 0u)
 563       {
 564         /* Accumulator is made zero for every iteration */
 565         sum = 0;
 566
 567         /* srcBLen number of MACS should be performed */
 568         k = srcBLen;
 569
 570         while(k > 0u)
 571         {
 572           /* Perform the multiply-accumulate */
 573           sum += (q63_t) ((q31_t) * px++ * *py--);
 574
 575           /* Decrement the loop counter */
 576           k--;
 577         }
 578
 579         /* Store the result in the accumulator in the destination buffer. */
 580         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 581
 582         /* Increment the MAC count */
 583         count++;
 584
 585         /* Update the inputA and inputB pointers for next MAC calculation */
 586         px = pIn1 + count;
 587         py = pSrc2;
 588
 589         /* Decrement the loop counter */
 590         blkCnt--;
 591       }
 592     }
 593
 594
 595     /* --------------------------
 596      * Initializations of stage3
 597      * -------------------------*/
 598
 599     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 600      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 601      * ....
 602      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 603      * sum +=  x[srcALen-1] * y[srcBLen-1]
 604      */
 605
 606     /* In this stage the MAC operations are decreased by 1 for every iteration.
 607        The count variable holds the number of MAC operations performed */
 608     count = srcBLen - 1u;
 609
 610     /* Working pointer of inputA */
 611     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 612     px = pSrc1;
 613
 614     /* Working pointer of inputB */
 615     pSrc2 = pIn2 + (srcBLen - 1u);
 616     pIn2 = pSrc2 - 1u;
 617     py = pIn2;
 618
 619     /* -------------------
 620      * Stage3 process
 621      * ------------------*/
 622
 623     /* For loop unrolling by 4, this stage is divided into two. */
 624     /* First part of this stage computes the MAC operations greater than 4 */
 625     /* Second part of this stage computes the MAC operations less than or equal to 4 */
 626
 627     /* The first part of the stage starts here */
 628     j = count >> 2u;
 629
 630     while((j > 0u) && (blockSize3 > 0))
 631     {
 632       /* Accumulator is made zero for every iteration */
 633       sum = 0;
 634
 635       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 636       k = count >> 2u;
 637
 638       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 639        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 640       while(k > 0u)
 641       {
 642         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 643          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 644         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 645         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 646          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 647         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 648
 649         /* Decrement the loop counter */
 650         k--;
 651       }
 652
 653       /* For the next MAC operations, the pointer py is used without SIMD
 654        * So, py is incremented by 1 */
 655       py = py + 1u;
 656
 657       /* If the count is not a multiple of 4, compute any remaining MACs here.
 658        ** No loop unrolling is used. */
 659       k = count % 0x4u;
 660
 661       while(k > 0u)
 662       {
 663         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 664         sum = __SMLALD(*px++, *py--, sum);
 665
 666         /* Decrement the loop counter */
 667         k--;
 668       }
 669
 670       /* Store the result in the accumulator in the destination buffer. */
 671       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 672
 673       /* Update the inputA and inputB pointers for next MAC calculation */
 674       px = ++pSrc1;
 675       py = pIn2;
 676
 677       /* Decrement the MAC count */
 678       count--;
 679
 680       /* Decrement the loop counter */
 681       blockSize3--;
 682
 683       j--;
 684     }
 685
 686     /* The second part of the stage starts here */
 687     /* SIMD is not used for the next MAC operations,
 688      * so pointer py is updated to read only one sample at a time */
 689     py = py + 1u;
 690
 691     while(blockSize3 > 0)
 692     {
 693       /* Accumulator is made zero for every iteration */
 694       sum = 0;
 695
 696       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 697       k = count;
 698
 699       while(k > 0u)
 700       {
 701         /* Perform the multiply-accumulates */
 702         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 703         sum = __SMLALD(*px++, *py--, sum);
 704
 705         /* Decrement the loop counter */
 706         k--;
 707       }
 708
 709       /* Store the result in the accumulator in the destination buffer. */
 710       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 711
 712       /* Update the inputA and inputB pointers for next MAC calculation */
 713       px = ++pSrc1;
 714       py = pSrc2;
 715
 716       /* Decrement the MAC count */
 717       count--;
 718
 719       /* Decrement the loop counter */
 720       blockSize3--;
 721     }
 722
 723     /* set status as ARM_MATH_SUCCESS */
 724     status = ARM_MATH_SUCCESS;
 725   }
 726
 727   /* Return to application */
 728   return (status);
 729
 730 #else
 731
 732   /* Run the below code for Cortex-M0 */
 733
 734   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
 735   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
 736   q63_t sum;                                     /* Accumulator */
 737   uint32_t i, j;                                 /* loop counters */
 738   arm_status status;                             /* status of Partial convolution */
 739
 740   /* Check for range of output samples to be calculated */
 741   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 742   {
 743     /* Set status as ARM_ARGUMENT_ERROR */
 744     status = ARM_MATH_ARGUMENT_ERROR;
 745   }
 746   else
 747   {
 748     /* Loop to calculate convolution for output length number of values */
 749     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
 750     {
 751       /* Initialize sum with zero to carry on MAC operations */
 752       sum = 0;
 753
 754       /* Loop to perform MAC operations according to convolution equation */
 755       for (j = 0; j <= i; j++)
 756       {
 757         /* Check the array limitations */
 758         if(((i - j) < srcBLen) && (j < srcALen))
 759         {
 760           /* z[i] += x[i-j] * y[j] */
 761           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
 762         }
 763       }
 764
 765       /* Store the output in the destination buffer */
 766       pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
 767     }
 768     /* set status as ARM_SUCCESS as there are no argument errors */
 769     status = ARM_MATH_SUCCESS;
 770   }
 771   return (status);
 772
 773 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)  */
 774
 775 }
 776
 777 /**
 778  * @} end of PartialConv group
 779  */