tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_q7.c
   9 *
  10 * Description:  Correlation of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Corr
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Correlation of Q7 sequences.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  59  * @return none.
  60  *
  61  * @details
  62  * <b>Scaling and Overflow Behavior:</b>
  63  *
  64  * \par
  65  * The function is implemented using a 32-bit internal accumulator.
  66  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  67  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  68  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  69  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
  70  *
  71  * \par
  72  * Refer the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function.
  73  *
  74  */
  75
  76 void arm_correlate_q7(
  77   q7_t * pSrcA,
  78   uint32_t srcALen,
  79   q7_t * pSrcB,
  80   uint32_t srcBLen,
  81   q7_t * pDst)
  82 {
  83
  84
  85 #ifndef ARM_MATH_CM0_FAMILY
  86
  87   /* Run the below code for Cortex-M4 and Cortex-M3 */
  88
  89   q7_t *pIn1;                                    /* inputA pointer               */
  90   q7_t *pIn2;                                    /* inputB pointer               */
  91   q7_t *pOut = pDst;                             /* output pointer               */
  92   q7_t *px;                                      /* Intermediate inputA pointer  */
  93   q7_t *py;                                      /* Intermediate inputB pointer  */
  94   q7_t *pSrc1;                                   /* Intermediate pointers        */
  95   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  96   q31_t input1, input2;                          /* temporary variables */
  97   q15_t in1, in2;                                /* temporary variables */
  98   q7_t x0, x1, x2, x3, c0, c1;                   /* temporary variables for holding input and coefficient values */
  99   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
 100   int32_t inc = 1;
 101
 102
 103   /* The algorithm implementation is based on the lengths of the inputs. */
 104   /* srcB is always made to slide across srcA. */
 105   /* So srcBLen is always considered as shorter or equal to srcALen */
 106   /* But CORR(x, y) is reverse of CORR(y, x) */
 107   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 108   /* and the destination pointer modifier, inc is set to -1 */
 109   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 110   /* But to improve the performance,
 111    * we include zeroes in the output instead of zero padding either of the the inputs*/
 112   /* If srcALen > srcBLen,
 113    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 114   /* If srcALen < srcBLen,
 115    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 116   if(srcALen >= srcBLen)
 117   {
 118     /* Initialization of inputA pointer */
 119     pIn1 = (pSrcA);
 120
 121     /* Initialization of inputB pointer */
 122     pIn2 = (pSrcB);
 123
 124     /* Number of output samples is calculated */
 125     outBlockSize = (2u * srcALen) - 1u;
 126
 127     /* When srcALen > srcBLen, zero padding is done to srcB
 128      * to make their lengths equal.
 129      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 130      * number of output samples are made zero */
 131     j = outBlockSize - (srcALen + (srcBLen - 1u));
 132
 133     /* Updating the pointer position to non zero value */
 134     pOut += j;
 135
 136   }
 137   else
 138   {
 139     /* Initialization of inputA pointer */
 140     pIn1 = (pSrcB);
 141
 142     /* Initialization of inputB pointer */
 143     pIn2 = (pSrcA);
 144
 145     /* srcBLen is always considered as shorter or equal to srcALen */
 146     j = srcBLen;
 147     srcBLen = srcALen;
 148     srcALen = j;
 149
 150     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 151     /* Hence set the destination pointer to point to the last output sample */
 152     pOut = pDst + ((srcALen + srcBLen) - 2u);
 153
 154     /* Destination address modifier is set to -1 */
 155     inc = -1;
 156
 157   }
 158
 159   /* The function is internally
 160    * divided into three parts according to the number of multiplications that has to be
 161    * taken place between inputA samples and inputB samples. In the first part of the
 162    * algorithm, the multiplications increase by one for every iteration.
 163    * In the second part of the algorithm, srcBLen number of multiplications are done.
 164    * In the third part of the algorithm, the multiplications decrease by one
 165    * for every iteration.*/
 166   /* The algorithm is implemented in three stages.
 167    * The loop counters of each stage is initiated here. */
 168   blockSize1 = srcBLen - 1u;
 169   blockSize2 = srcALen - (srcBLen - 1u);
 170   blockSize3 = blockSize1;
 171
 172   /* --------------------------
 173    * Initializations of stage1
 174    * -------------------------*/
 175
 176   /* sum = x[0] * y[srcBlen - 1]
 177    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 178    * ....
 179    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 180    */
 181
 182   /* In this stage the MAC operations are increased by 1 for every iteration.
 183      The count variable holds the number of MAC operations performed */
 184   count = 1u;
 185
 186   /* Working pointer of inputA */
 187   px = pIn1;
 188
 189   /* Working pointer of inputB */
 190   pSrc1 = pIn2 + (srcBLen - 1u);
 191   py = pSrc1;
 192
 193   /* ------------------------
 194    * Stage1 process
 195    * ----------------------*/
 196
 197   /* The first stage starts here */
 198   while(blockSize1 > 0u)
 199   {
 200     /* Accumulator is made zero for every iteration */
 201     sum = 0;
 202
 203     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 204     k = count >> 2;
 205
 206     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 207      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 208     while(k > 0u)
 209     {
 210       /* x[0] , x[1] */
 211       in1 = (q15_t) * px++;
 212       in2 = (q15_t) * px++;
 213       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 214
 215       /* y[srcBLen - 4] , y[srcBLen - 3] */
 216       in1 = (q15_t) * py++;
 217       in2 = (q15_t) * py++;
 218       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 219
 220       /* x[0] * y[srcBLen - 4] */
 221       /* x[1] * y[srcBLen - 3] */
 222       sum = __SMLAD(input1, input2, sum);
 223
 224       /* x[2] , x[3] */
 225       in1 = (q15_t) * px++;
 226       in2 = (q15_t) * px++;
 227       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 228
 229       /* y[srcBLen - 2] , y[srcBLen - 1] */
 230       in1 = (q15_t) * py++;
 231       in2 = (q15_t) * py++;
 232       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 233
 234       /* x[2] * y[srcBLen - 2] */
 235       /* x[3] * y[srcBLen - 1] */
 236       sum = __SMLAD(input1, input2, sum);
 237
 238
 239       /* Decrement the loop counter */
 240       k--;
 241     }
 242
 243     /* If the count is not a multiple of 4, compute any remaining MACs here.
 244      ** No loop unrolling is used. */
 245     k = count % 0x4u;
 246
 247     while(k > 0u)
 248     {
 249       /* Perform the multiply-accumulates */
 250       /* x[0] * y[srcBLen - 1] */
 251       sum += (q31_t) ((q15_t) * px++ * *py++);
 252
 253       /* Decrement the loop counter */
 254       k--;
 255     }
 256
 257     /* Store the result in the accumulator in the destination buffer. */
 258     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 259     /* Destination pointer is updated according to the address modifier, inc */
 260     pOut += inc;
 261
 262     /* Update the inputA and inputB pointers for next MAC calculation */
 263     py = pSrc1 - count;
 264     px = pIn1;
 265
 266     /* Increment the MAC count */
 267     count++;
 268
 269     /* Decrement the loop counter */
 270     blockSize1--;
 271   }
 272
 273   /* --------------------------
 274    * Initializations of stage2
 275    * ------------------------*/
 276
 277   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 278    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 279    * ....
 280    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 281    */
 282
 283   /* Working pointer of inputA */
 284   px = pIn1;
 285
 286   /* Working pointer of inputB */
 287   py = pIn2;
 288
 289   /* count is index by which the pointer pIn1 to be incremented */
 290   count = 0u;
 291
 292   /* -------------------
 293    * Stage2 process
 294    * ------------------*/
 295
 296   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 297    * So, to loop unroll over blockSize2,
 298    * srcBLen should be greater than or equal to 4 */
 299   if(srcBLen >= 4u)
 300   {
 301     /* Loop unroll over blockSize2, by 4 */
 302     blkCnt = blockSize2 >> 2u;
 303
 304     while(blkCnt > 0u)
 305     {
 306       /* Set all accumulators to zero */
 307       acc0 = 0;
 308       acc1 = 0;
 309       acc2 = 0;
 310       acc3 = 0;
 311
 312       /* read x[0], x[1], x[2] samples */
 313       x0 = *px++;
 314       x1 = *px++;
 315       x2 = *px++;
 316
 317       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 318       k = srcBLen >> 2u;
 319
 320       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 321        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 322       do
 323       {
 324         /* Read y[0] sample */
 325         c0 = *py++;
 326         /* Read y[1] sample */
 327         c1 = *py++;
 328
 329         /* Read x[3] sample */
 330         x3 = *px++;
 331
 332         /* x[0] and x[1] are packed */
 333         in1 = (q15_t) x0;
 334         in2 = (q15_t) x1;
 335
 336         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 337
 338         /* y[0] and y[1] are packed */
 339         in1 = (q15_t) c0;
 340         in2 = (q15_t) c1;
 341
 342         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 343
 344         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
 345         acc0 = __SMLAD(input1, input2, acc0);
 346
 347         /* x[1] and x[2] are packed */
 348         in1 = (q15_t) x1;
 349         in2 = (q15_t) x2;
 350
 351         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 352
 353         /* acc1 += x[1] * y[0] + x[2] * y[1] */
 354         acc1 = __SMLAD(input1, input2, acc1);
 355
 356         /* x[2] and x[3] are packed */
 357         in1 = (q15_t) x2;
 358         in2 = (q15_t) x3;
 359
 360         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 361
 362         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
 363         acc2 = __SMLAD(input1, input2, acc2);
 364
 365         /* Read x[4] sample */
 366         x0 = *(px++);
 367
 368         /* x[3] and x[4] are packed */
 369         in1 = (q15_t) x3;
 370         in2 = (q15_t) x0;
 371
 372         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 373
 374         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
 375         acc3 = __SMLAD(input1, input2, acc3);
 376
 377         /* Read y[2] sample */
 378         c0 = *py++;
 379         /* Read y[3] sample */
 380         c1 = *py++;
 381
 382         /* Read x[5] sample */
 383         x1 = *px++;
 384
 385         /* x[2] and x[3] are packed */
 386         in1 = (q15_t) x2;
 387         in2 = (q15_t) x3;
 388
 389         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 390
 391         /* y[2] and y[3] are packed */
 392         in1 = (q15_t) c0;
 393         in2 = (q15_t) c1;
 394
 395         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 396
 397         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
 398         acc0 = __SMLAD(input1, input2, acc0);
 399
 400         /* x[3] and x[4] are packed */
 401         in1 = (q15_t) x3;
 402         in2 = (q15_t) x0;
 403
 404         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 405
 406         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
 407         acc1 = __SMLAD(input1, input2, acc1);
 408
 409         /* x[4] and x[5] are packed */
 410         in1 = (q15_t) x0;
 411         in2 = (q15_t) x1;
 412
 413         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 414
 415         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
 416         acc2 = __SMLAD(input1, input2, acc2);
 417
 418         /* Read x[6] sample */
 419         x2 = *px++;
 420
 421         /* x[5] and x[6] are packed */
 422         in1 = (q15_t) x1;
 423         in2 = (q15_t) x2;
 424
 425         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 426
 427         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
 428         acc3 = __SMLAD(input1, input2, acc3);
 429
 430       } while(--k);
 431
 432       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 433        ** No loop unrolling is used. */
 434       k = srcBLen % 0x4u;
 435
 436       while(k > 0u)
 437       {
 438         /* Read y[4] sample */
 439         c0 = *py++;
 440
 441         /* Read x[7] sample */
 442         x3 = *px++;
 443
 444         /* Perform the multiply-accumulates */
 445         /* acc0 +=  x[4] * y[4] */
 446         acc0 += ((q15_t) x0 * c0);
 447         /* acc1 +=  x[5] * y[4] */
 448         acc1 += ((q15_t) x1 * c0);
 449         /* acc2 +=  x[6] * y[4] */
 450         acc2 += ((q15_t) x2 * c0);
 451         /* acc3 +=  x[7] * y[4] */
 452         acc3 += ((q15_t) x3 * c0);
 453
 454         /* Reuse the present samples for the next MAC */
 455         x0 = x1;
 456         x1 = x2;
 457         x2 = x3;
 458
 459         /* Decrement the loop counter */
 460         k--;
 461       }
 462
 463       /* Store the result in the accumulator in the destination buffer. */
 464       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
 465       /* Destination pointer is updated according to the address modifier, inc */
 466       pOut += inc;
 467
 468       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
 469       pOut += inc;
 470
 471       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
 472       pOut += inc;
 473
 474       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
 475       pOut += inc;
 476
 477           count += 4u;
 478       /* Update the inputA and inputB pointers for next MAC calculation */
 479       px = pIn1 + count;
 480       py = pIn2;
 481
 482       /* Decrement the loop counter */
 483       blkCnt--;
 484     }
 485
 486     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 487      ** No loop unrolling is used. */
 488     blkCnt = blockSize2 % 0x4u;
 489
 490     while(blkCnt > 0u)
 491     {
 492       /* Accumulator is made zero for every iteration */
 493       sum = 0;
 494
 495       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 496       k = srcBLen >> 2u;
 497
 498       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 499        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 500       while(k > 0u)
 501       {
 502         /* Reading two inputs of SrcA buffer and packing */
 503         in1 = (q15_t) * px++;
 504         in2 = (q15_t) * px++;
 505         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 506
 507         /* Reading two inputs of SrcB buffer and packing */
 508         in1 = (q15_t) * py++;
 509         in2 = (q15_t) * py++;
 510         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 511
 512         /* Perform the multiply-accumulates */
 513         sum = __SMLAD(input1, input2, sum);
 514
 515         /* Reading two inputs of SrcA buffer and packing */
 516         in1 = (q15_t) * px++;
 517         in2 = (q15_t) * px++;
 518         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 519
 520         /* Reading two inputs of SrcB buffer and packing */
 521         in1 = (q15_t) * py++;
 522         in2 = (q15_t) * py++;
 523         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 524
 525         /* Perform the multiply-accumulates */
 526         sum = __SMLAD(input1, input2, sum);
 527
 528         /* Decrement the loop counter */
 529         k--;
 530       }
 531
 532       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 533        ** No loop unrolling is used. */
 534       k = srcBLen % 0x4u;
 535
 536       while(k > 0u)
 537       {
 538         /* Perform the multiply-accumulates */
 539         sum += ((q15_t) * px++ * *py++);
 540
 541         /* Decrement the loop counter */
 542         k--;
 543       }
 544
 545       /* Store the result in the accumulator in the destination buffer. */
 546       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 547       /* Destination pointer is updated according to the address modifier, inc */
 548       pOut += inc;
 549
 550       /* Increment the pointer pIn1 index, count by 1 */
 551           count++;
 552
 553       /* Update the inputA and inputB pointers for next MAC calculation */
 554       px = pIn1 + count;
 555       py = pIn2;
 556
 557       /* Decrement the loop counter */
 558       blkCnt--;
 559     }
 560   }
 561   else
 562   {
 563     /* If the srcBLen is not a multiple of 4,
 564      * the blockSize2 loop cannot be unrolled by 4 */
 565     blkCnt = blockSize2;
 566
 567     while(blkCnt > 0u)
 568     {
 569       /* Accumulator is made zero for every iteration */
 570       sum = 0;
 571
 572       /* Loop over srcBLen */
 573       k = srcBLen;
 574
 575       while(k > 0u)
 576       {
 577         /* Perform the multiply-accumulate */
 578         sum += ((q15_t) * px++ * *py++);
 579
 580         /* Decrement the loop counter */
 581         k--;
 582       }
 583
 584       /* Store the result in the accumulator in the destination buffer. */
 585       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 586       /* Destination pointer is updated according to the address modifier, inc */
 587       pOut += inc;
 588
 589       /* Increment the MAC count */
 590       count++;
 591
 592       /* Update the inputA and inputB pointers for next MAC calculation */
 593       px = pIn1 + count;
 594       py = pIn2;
 595
 596
 597       /* Decrement the loop counter */
 598       blkCnt--;
 599     }
 600   }
 601
 602   /* --------------------------
 603    * Initializations of stage3
 604    * -------------------------*/
 605
 606   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 607    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 608    * ....
 609    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 610    * sum +=  x[srcALen-1] * y[0]
 611    */
 612
 613   /* In this stage the MAC operations are decreased by 1 for every iteration.
 614      The count variable holds the number of MAC operations performed */
 615   count = srcBLen - 1u;
 616
 617   /* Working pointer of inputA */
 618   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 619   px = pSrc1;
 620
 621   /* Working pointer of inputB */
 622   py = pIn2;
 623
 624   /* -------------------
 625    * Stage3 process
 626    * ------------------*/
 627
 628   while(blockSize3 > 0u)
 629   {
 630     /* Accumulator is made zero for every iteration */
 631     sum = 0;
 632
 633     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 634     k = count >> 2u;
 635
 636     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 637      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 638     while(k > 0u)
 639     {
 640       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
 641       in1 = (q15_t) * px++;
 642       in2 = (q15_t) * px++;
 643       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 644
 645       /* y[0] , y[1] */
 646       in1 = (q15_t) * py++;
 647       in2 = (q15_t) * py++;
 648       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 649
 650       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 651       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 652       sum = __SMLAD(input1, input2, sum);
 653
 654       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
 655       in1 = (q15_t) * px++;
 656       in2 = (q15_t) * px++;
 657       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 658
 659       /* y[2] , y[3] */
 660       in1 = (q15_t) * py++;
 661       in2 = (q15_t) * py++;
 662       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 663
 664       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 665       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 666       sum = __SMLAD(input1, input2, sum);
 667
 668       /* Decrement the loop counter */
 669       k--;
 670     }
 671
 672     /* If the count is not a multiple of 4, compute any remaining MACs here.
 673      ** No loop unrolling is used. */
 674     k = count % 0x4u;
 675
 676     while(k > 0u)
 677     {
 678       /* Perform the multiply-accumulates */
 679       sum += ((q15_t) * px++ * *py++);
 680
 681       /* Decrement the loop counter */
 682       k--;
 683     }
 684
 685     /* Store the result in the accumulator in the destination buffer. */
 686     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 687     /* Destination pointer is updated according to the address modifier, inc */
 688     pOut += inc;
 689
 690     /* Update the inputA and inputB pointers for next MAC calculation */
 691     px = ++pSrc1;
 692     py = pIn2;
 693
 694     /* Decrement the MAC count */
 695     count--;
 696
 697     /* Decrement the loop counter */
 698     blockSize3--;
 699   }
 700
 701 #else
 702
 703 /* Run the below code for Cortex-M0 */
 704
 705   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
 706   q7_t *pIn2 = pSrcB + (srcBLen - 1u);           /* inputB pointer */
 707   q31_t sum;                                     /* Accumulator */
 708   uint32_t i = 0u, j;                            /* loop counters */
 709   uint32_t inv = 0u;                             /* Reverse order flag */
 710   uint32_t tot = 0u;                             /* Length */
 711
 712   /* The algorithm implementation is based on the lengths of the inputs. */
 713   /* srcB is always made to slide across srcA. */
 714   /* So srcBLen is always considered as shorter or equal to srcALen */
 715   /* But CORR(x, y) is reverse of CORR(y, x) */
 716   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 717   /* and a varaible, inv is set to 1 */
 718   /* If lengths are not equal then zero pad has to be done to  make the two
 719    * inputs of same length. But to improve the performance, we include zeroes
 720    * in the output instead of zero padding either of the the inputs*/
 721   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 722    * starting of the output buffer */
 723   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 724    * ending of the output buffer */
 725   /* Once the zero padding is done the remaining of the output is calcualted
 726    * using convolution but with the shorter signal time shifted. */
 727
 728   /* Calculate the length of the remaining sequence */
 729   tot = ((srcALen + srcBLen) - 2u);
 730
 731   if(srcALen > srcBLen)
 732   {
 733     /* Calculating the number of zeros to be padded to the output */
 734     j = srcALen - srcBLen;
 735
 736     /* Initialise the pointer after zero padding */
 737     pDst += j;
 738   }
 739
 740   else if(srcALen < srcBLen)
 741   {
 742     /* Initialization to inputB pointer */
 743     pIn1 = pSrcB;
 744
 745     /* Initialization to the end of inputA pointer */
 746     pIn2 = pSrcA + (srcALen - 1u);
 747
 748     /* Initialisation of the pointer after zero padding */
 749     pDst = pDst + tot;
 750
 751     /* Swapping the lengths */
 752     j = srcALen;
 753     srcALen = srcBLen;
 754     srcBLen = j;
 755
 756     /* Setting the reverse flag */
 757     inv = 1;
 758
 759   }
 760
 761   /* Loop to calculate convolution for output length number of times */
 762   for (i = 0u; i <= tot; i++)
 763   {
 764     /* Initialize sum with zero to carry on MAC operations */
 765     sum = 0;
 766
 767     /* Loop to perform MAC operations according to convolution equation */
 768     for (j = 0u; j <= i; j++)
 769     {
 770       /* Check the array limitations */
 771       if((((i - j) < srcBLen) && (j < srcALen)))
 772       {
 773         /* z[i] += x[i-j] * y[j] */
 774         sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
 775       }
 776     }
 777     /* Store the output in the destination buffer */
 778     if(inv == 1)
 779       *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u);
 780     else
 781       *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u);
 782   }
 783
 784 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
 785
 786 }
 787
 788 /**
 789  * @} end of Corr group
 790  */