tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_f32.c

   1 /* ----------------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_f32.c
   9 *
  10 * Description:   Correlation of floating-point sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @defgroup Corr Correlation
  49  *
  50  * Correlation is a mathematical operation that is similar to convolution.
  51  * As with convolution, correlation uses two signals to produce a third signal.
  52  * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
  53  * Correlation is commonly used to measure the similarity between two signals.
  54  * It has applications in pattern recognition, cryptanalysis, and searching.
  55  * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
  56  * Fast versions of the Q15 and Q31 functions are also provided.
  57  *
  58  * \par Algorithm
  59  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
  60  * The convolution of the two signals is denoted by
  61  * <pre>
  62  *                   c[n] = a[n] * b[n]
  63  * </pre>
  64  * In correlation, one of the signals is flipped in time
  65  * <pre>
  66  *                   c[n] = a[n] * b[-n]
  67  * </pre>
  68  *
  69  * \par
  70  * and this is mathematically defined as
  71  * \image html CorrelateEquation.gif
  72  * \par
  73  * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
  74  * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
  75  * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
  76  *
  77  * <b>Note</b>
  78  * \par
  79  * The <code>pDst</code> should be initialized to all zeros before being used.
  80  *
  81  * <b>Fixed-Point Behavior</b>
  82  * \par
  83  * Correlation requires summing up a large number of intermediate products.
  84  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
  85  * Refer to the function specific documentation below for further details of the particular algorithm used.
  86  *
  87  *
  88  * <b>Fast Versions</b>
  89  *
  90  * \par
 * Fast versions are supported for Q31 and Q15.  The fast versions take fewer cycles than the standard Q31 and Q15 correlate functions,
 * but they require the input signals to be scaled down to avoid intermediate overflows.
  93  *
  94  *
  95  * <b>Opt Versions</b>
  96  *
  97  * \par
 * Opt versions are supported for Q15 and Q7.  The design uses an internal scratch buffer to achieve better optimisation.
 * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of correlate.
 100  */
 101
 102 /**
 103  * @addtogroup Corr
 104  * @{
 105  */
 106 /**
 107  * @brief Correlation of floating-point sequences.
 108  * @param[in]  *pSrcA points to the first input sequence.
 109  * @param[in]  srcALen length of the first input sequence.
 110  * @param[in]  *pSrcB points to the second input sequence.
 111  * @param[in]  srcBLen length of the second input sequence.
 112  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
 113  * @return none.
 114  */
 115
 116 void arm_correlate_f32(
 117   float32_t * pSrcA,
 118   uint32_t srcALen,
 119   float32_t * pSrcB,
 120   uint32_t srcBLen,
 121   float32_t * pDst)
 122 {
 123
 124
 125 #ifndef ARM_MATH_CM0_FAMILY
 126
 127   /* Run the below code for Cortex-M4 and Cortex-M3 */
 128
 129   float32_t *pIn1;                               /* inputA pointer */
 130   float32_t *pIn2;                               /* inputB pointer */
 131   float32_t *pOut = pDst;                        /* output pointer */
 132   float32_t *px;                                 /* Intermediate inputA pointer */
 133   float32_t *py;                                 /* Intermediate inputB pointer */
 134   float32_t *pSrc1;                              /* Intermediate pointers */
 135   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators */
 136   float32_t x0, x1, x2, x3, c0;                  /* temporary variables for holding input and coefficient values */
 137   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters */
 138   int32_t inc = 1;                               /* Destination address modifier */
 139
 140
 141   /* The algorithm implementation is based on the lengths of the inputs. */
 142   /* srcB is always made to slide across srcA. */
 143   /* So srcBLen is always considered as shorter or equal to srcALen */
 144   /* But CORR(x, y) is reverse of CORR(y, x) */
 145   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 146   /* and the destination pointer modifier, inc is set to -1 */
 147   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 148   /* But to improve the performance,
 149    * we include zeroes in the output instead of zero padding either of the the inputs*/
 150   /* If srcALen > srcBLen,
 151    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 152   /* If srcALen < srcBLen,
 153    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 154   if(srcALen >= srcBLen)
 155   {
 156     /* Initialization of inputA pointer */
 157     pIn1 = pSrcA;
 158
 159     /* Initialization of inputB pointer */
 160     pIn2 = pSrcB;
 161
 162     /* Number of output samples is calculated */
 163     outBlockSize = (2u * srcALen) - 1u;
 164
 165     /* When srcALen > srcBLen, zero padding has to be done to srcB
 166      * to make their lengths equal.
 167      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 168      * number of output samples are made zero */
 169     j = outBlockSize - (srcALen + (srcBLen - 1u));
 170
 171     /* Updating the pointer position to non zero value */
 172     pOut += j;
 173
 174     //while(j > 0u)
 175     //{
 176     //  /* Zero is stored in the destination buffer */
 177     //  *pOut++ = 0.0f;
 178
 179     //  /* Decrement the loop counter */
 180     //  j--;
 181     //}
 182
 183   }
 184   else
 185   {
 186     /* Initialization of inputA pointer */
 187     pIn1 = pSrcB;
 188
 189     /* Initialization of inputB pointer */
 190     pIn2 = pSrcA;
 191
 192     /* srcBLen is always considered as shorter or equal to srcALen */
 193     j = srcBLen;
 194     srcBLen = srcALen;
 195     srcALen = j;
 196
 197     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 198     /* Hence set the destination pointer to point to the last output sample */
 199     pOut = pDst + ((srcALen + srcBLen) - 2u);
 200
 201     /* Destination address modifier is set to -1 */
 202     inc = -1;
 203
 204   }
 205
 206   /* The function is internally
 207    * divided into three parts according to the number of multiplications that has to be
 208    * taken place between inputA samples and inputB samples. In the first part of the
 209    * algorithm, the multiplications increase by one for every iteration.
 210    * In the second part of the algorithm, srcBLen number of multiplications are done.
 211    * In the third part of the algorithm, the multiplications decrease by one
 212    * for every iteration.*/
 213   /* The algorithm is implemented in three stages.
 214    * The loop counters of each stage is initiated here. */
 215   blockSize1 = srcBLen - 1u;
 216   blockSize2 = srcALen - (srcBLen - 1u);
 217   blockSize3 = blockSize1;
 218
 219   /* --------------------------
 220    * Initializations of stage1
 221    * -------------------------*/
 222
 223   /* sum = x[0] * y[srcBlen - 1]
 224    * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
 225    * ....
 226    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 227    */
 228
 229   /* In this stage the MAC operations are increased by 1 for every iteration.
 230      The count variable holds the number of MAC operations performed */
 231   count = 1u;
 232
 233   /* Working pointer of inputA */
 234   px = pIn1;
 235
 236   /* Working pointer of inputB */
 237   pSrc1 = pIn2 + (srcBLen - 1u);
 238   py = pSrc1;
 239
 240   /* ------------------------
 241    * Stage1 process
 242    * ----------------------*/
 243
 244   /* The first stage starts here */
 245   while(blockSize1 > 0u)
 246   {
 247     /* Accumulator is made zero for every iteration */
 248     sum = 0.0f;
 249
 250     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 251     k = count >> 2u;
 252
 253     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 254      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 255     while(k > 0u)
 256     {
 257       /* x[0] * y[srcBLen - 4] */
 258       sum += *px++ * *py++;
 259       /* x[1] * y[srcBLen - 3] */
 260       sum += *px++ * *py++;
 261       /* x[2] * y[srcBLen - 2] */
 262       sum += *px++ * *py++;
 263       /* x[3] * y[srcBLen - 1] */
 264       sum += *px++ * *py++;
 265
 266       /* Decrement the loop counter */
 267       k--;
 268     }
 269
 270     /* If the count is not a multiple of 4, compute any remaining MACs here.
 271      ** No loop unrolling is used. */
 272     k = count % 0x4u;
 273
 274     while(k > 0u)
 275     {
 276       /* Perform the multiply-accumulate */
 277       /* x[0] * y[srcBLen - 1] */
 278       sum += *px++ * *py++;
 279
 280       /* Decrement the loop counter */
 281       k--;
 282     }
 283
 284     /* Store the result in the accumulator in the destination buffer. */
 285     *pOut = sum;
 286     /* Destination pointer is updated according to the address modifier, inc */
 287     pOut += inc;
 288
 289     /* Update the inputA and inputB pointers for next MAC calculation */
 290     py = pSrc1 - count;
 291     px = pIn1;
 292
 293     /* Increment the MAC count */
 294     count++;
 295
 296     /* Decrement the loop counter */
 297     blockSize1--;
 298   }
 299
 300   /* --------------------------
 301    * Initializations of stage2
 302    * ------------------------*/
 303
 304   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 305    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 306    * ....
 307    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 308    */
 309
 310   /* Working pointer of inputA */
 311   px = pIn1;
 312
 313   /* Working pointer of inputB */
 314   py = pIn2;
 315
 316   /* count is index by which the pointer pIn1 to be incremented */
 317   count = 0u;
 318
 319   /* -------------------
 320    * Stage2 process
 321    * ------------------*/
 322
 323   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 324    * So, to loop unroll over blockSize2,
 325    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 326   if(srcBLen >= 4u)
 327   {
 328     /* Loop unroll over blockSize2, by 4 */
 329     blkCnt = blockSize2 >> 2u;
 330
 331     while(blkCnt > 0u)
 332     {
 333       /* Set all accumulators to zero */
 334       acc0 = 0.0f;
 335       acc1 = 0.0f;
 336       acc2 = 0.0f;
 337       acc3 = 0.0f;
 338
 339       /* read x[0], x[1], x[2] samples */
 340       x0 = *(px++);
 341       x1 = *(px++);
 342       x2 = *(px++);
 343
 344       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 345       k = srcBLen >> 2u;
 346
 347       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 348        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 349       do
 350       {
 351         /* Read y[0] sample */
 352         c0 = *(py++);
 353
 354         /* Read x[3] sample */
 355         x3 = *(px++);
 356
 357         /* Perform the multiply-accumulate */
 358         /* acc0 +=  x[0] * y[0] */
 359         acc0 += x0 * c0;
 360         /* acc1 +=  x[1] * y[0] */
 361         acc1 += x1 * c0;
 362         /* acc2 +=  x[2] * y[0] */
 363         acc2 += x2 * c0;
 364         /* acc3 +=  x[3] * y[0] */
 365         acc3 += x3 * c0;
 366
 367         /* Read y[1] sample */
 368         c0 = *(py++);
 369
 370         /* Read x[4] sample */
 371         x0 = *(px++);
 372
 373         /* Perform the multiply-accumulate */
 374         /* acc0 +=  x[1] * y[1] */
 375         acc0 += x1 * c0;
 376         /* acc1 +=  x[2] * y[1] */
 377         acc1 += x2 * c0;
 378         /* acc2 +=  x[3] * y[1] */
 379         acc2 += x3 * c0;
 380         /* acc3 +=  x[4] * y[1] */
 381         acc3 += x0 * c0;
 382
 383         /* Read y[2] sample */
 384         c0 = *(py++);
 385
 386         /* Read x[5] sample */
 387         x1 = *(px++);
 388
 389         /* Perform the multiply-accumulates */
 390         /* acc0 +=  x[2] * y[2] */
 391         acc0 += x2 * c0;
 392         /* acc1 +=  x[3] * y[2] */
 393         acc1 += x3 * c0;
 394         /* acc2 +=  x[4] * y[2] */
 395         acc2 += x0 * c0;
 396         /* acc3 +=  x[5] * y[2] */
 397         acc3 += x1 * c0;
 398
 399         /* Read y[3] sample */
 400         c0 = *(py++);
 401
 402         /* Read x[6] sample */
 403         x2 = *(px++);
 404
 405         /* Perform the multiply-accumulates */
 406         /* acc0 +=  x[3] * y[3] */
 407         acc0 += x3 * c0;
 408         /* acc1 +=  x[4] * y[3] */
 409         acc1 += x0 * c0;
 410         /* acc2 +=  x[5] * y[3] */
 411         acc2 += x1 * c0;
 412         /* acc3 +=  x[6] * y[3] */
 413         acc3 += x2 * c0;
 414
 415
 416       } while(--k);
 417
 418       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 419        ** No loop unrolling is used. */
 420       k = srcBLen % 0x4u;
 421
 422       while(k > 0u)
 423       {
 424         /* Read y[4] sample */
 425         c0 = *(py++);
 426
 427         /* Read x[7] sample */
 428         x3 = *(px++);
 429
 430         /* Perform the multiply-accumulates */
 431         /* acc0 +=  x[4] * y[4] */
 432         acc0 += x0 * c0;
 433         /* acc1 +=  x[5] * y[4] */
 434         acc1 += x1 * c0;
 435         /* acc2 +=  x[6] * y[4] */
 436         acc2 += x2 * c0;
 437         /* acc3 +=  x[7] * y[4] */
 438         acc3 += x3 * c0;
 439
 440         /* Reuse the present samples for the next MAC */
 441         x0 = x1;
 442         x1 = x2;
 443         x2 = x3;
 444
 445         /* Decrement the loop counter */
 446         k--;
 447       }
 448
 449       /* Store the result in the accumulator in the destination buffer. */
 450       *pOut = acc0;
 451       /* Destination pointer is updated according to the address modifier, inc */
 452       pOut += inc;
 453
 454       *pOut = acc1;
 455       pOut += inc;
 456
 457       *pOut = acc2;
 458       pOut += inc;
 459
 460       *pOut = acc3;
 461       pOut += inc;
 462
 463       /* Increment the pointer pIn1 index, count by 4 */
 464       count += 4u;
 465
 466       /* Update the inputA and inputB pointers for next MAC calculation */
 467       px = pIn1 + count;
 468       py = pIn2;
 469
 470       /* Decrement the loop counter */
 471       blkCnt--;
 472     }
 473
 474     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 475      ** No loop unrolling is used. */
 476     blkCnt = blockSize2 % 0x4u;
 477
 478     while(blkCnt > 0u)
 479     {
 480       /* Accumulator is made zero for every iteration */
 481       sum = 0.0f;
 482
 483       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 484       k = srcBLen >> 2u;
 485
 486       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 487        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 488       while(k > 0u)
 489       {
 490         /* Perform the multiply-accumulates */
 491         sum += *px++ * *py++;
 492         sum += *px++ * *py++;
 493         sum += *px++ * *py++;
 494         sum += *px++ * *py++;
 495
 496         /* Decrement the loop counter */
 497         k--;
 498       }
 499
 500       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 501        ** No loop unrolling is used. */
 502       k = srcBLen % 0x4u;
 503
 504       while(k > 0u)
 505       {
 506         /* Perform the multiply-accumulate */
 507         sum += *px++ * *py++;
 508
 509         /* Decrement the loop counter */
 510         k--;
 511       }
 512
 513       /* Store the result in the accumulator in the destination buffer. */
 514       *pOut = sum;
 515       /* Destination pointer is updated according to the address modifier, inc */
 516       pOut += inc;
 517
 518       /* Increment the pointer pIn1 index, count by 1 */
 519       count++;
 520
 521       /* Update the inputA and inputB pointers for next MAC calculation */
 522       px = pIn1 + count;
 523       py = pIn2;
 524
 525       /* Decrement the loop counter */
 526       blkCnt--;
 527     }
 528   }
 529   else
 530   {
 531     /* If the srcBLen is not a multiple of 4,
 532      * the blockSize2 loop cannot be unrolled by 4 */
 533     blkCnt = blockSize2;
 534
 535     while(blkCnt > 0u)
 536     {
 537       /* Accumulator is made zero for every iteration */
 538       sum = 0.0f;
 539
 540       /* Loop over srcBLen */
 541       k = srcBLen;
 542
 543       while(k > 0u)
 544       {
 545         /* Perform the multiply-accumulate */
 546         sum += *px++ * *py++;
 547
 548         /* Decrement the loop counter */
 549         k--;
 550       }
 551
 552       /* Store the result in the accumulator in the destination buffer. */
 553       *pOut = sum;
 554       /* Destination pointer is updated according to the address modifier, inc */
 555       pOut += inc;
 556
 557       /* Increment the pointer pIn1 index, count by 1 */
 558       count++;
 559
 560       /* Update the inputA and inputB pointers for next MAC calculation */
 561       px = pIn1 + count;
 562       py = pIn2;
 563
 564       /* Decrement the loop counter */
 565       blkCnt--;
 566     }
 567   }
 568
 569   /* --------------------------
 570    * Initializations of stage3
 571    * -------------------------*/
 572
 573   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 574    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 575    * ....
 576    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 577    * sum +=  x[srcALen-1] * y[0]
 578    */
 579
 580   /* In this stage the MAC operations are decreased by 1 for every iteration.
 581      The count variable holds the number of MAC operations performed */
 582   count = srcBLen - 1u;
 583
 584   /* Working pointer of inputA */
 585   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 586   px = pSrc1;
 587
 588   /* Working pointer of inputB */
 589   py = pIn2;
 590
 591   /* -------------------
 592    * Stage3 process
 593    * ------------------*/
 594
 595   while(blockSize3 > 0u)
 596   {
 597     /* Accumulator is made zero for every iteration */
 598     sum = 0.0f;
 599
 600     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 601     k = count >> 2u;
 602
 603     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 604      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 605     while(k > 0u)
 606     {
 607       /* Perform the multiply-accumulates */
 608       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 609       sum += *px++ * *py++;
 610       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 611       sum += *px++ * *py++;
 612       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 613       sum += *px++ * *py++;
 614       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 615       sum += *px++ * *py++;
 616
 617       /* Decrement the loop counter */
 618       k--;
 619     }
 620
 621     /* If the count is not a multiple of 4, compute any remaining MACs here.
 622      ** No loop unrolling is used. */
 623     k = count % 0x4u;
 624
 625     while(k > 0u)
 626     {
 627       /* Perform the multiply-accumulates */
 628       sum += *px++ * *py++;
 629
 630       /* Decrement the loop counter */
 631       k--;
 632     }
 633
 634     /* Store the result in the accumulator in the destination buffer. */
 635     *pOut = sum;
 636     /* Destination pointer is updated according to the address modifier, inc */
 637     pOut += inc;
 638
 639     /* Update the inputA and inputB pointers for next MAC calculation */
 640     px = ++pSrc1;
 641     py = pIn2;
 642
 643     /* Decrement the MAC count */
 644     count--;
 645
 646     /* Decrement the loop counter */
 647     blockSize3--;
 648   }
 649
 650 #else
 651
 652   /* Run the below code for Cortex-M0 */
 653
 654   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 655   float32_t *pIn2 = pSrcB + (srcBLen - 1u);      /* inputB pointer */
 656   float32_t sum;                                 /* Accumulator */
 657   uint32_t i = 0u, j;                            /* loop counters */
 658   uint32_t inv = 0u;                             /* Reverse order flag */
 659   uint32_t tot = 0u;                             /* Length */
 660
 661   /* The algorithm implementation is based on the lengths of the inputs. */
 662   /* srcB is always made to slide across srcA. */
 663   /* So srcBLen is always considered as shorter or equal to srcALen */
 664   /* But CORR(x, y) is reverse of CORR(y, x) */
 665   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 666   /* and a varaible, inv is set to 1 */
 667   /* If lengths are not equal then zero pad has to be done to  make the two
 668    * inputs of same length. But to improve the performance, we include zeroes
 669    * in the output instead of zero padding either of the the inputs*/
 670   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 671    * starting of the output buffer */
 672   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 673    * ending of the output buffer */
 674   /* Once the zero padding is done the remaining of the output is calcualted
 675    * using convolution but with the shorter signal time shifted. */
 676
 677   /* Calculate the length of the remaining sequence */
 678   tot = ((srcALen + srcBLen) - 2u);
 679
 680   if(srcALen > srcBLen)
 681   {
 682     /* Calculating the number of zeros to be padded to the output */
 683     j = srcALen - srcBLen;
 684
 685     /* Initialise the pointer after zero padding */
 686     pDst += j;
 687   }
 688
 689   else if(srcALen < srcBLen)
 690   {
 691     /* Initialization to inputB pointer */
 692     pIn1 = pSrcB;
 693
 694     /* Initialization to the end of inputA pointer */
 695     pIn2 = pSrcA + (srcALen - 1u);
 696
 697     /* Initialisation of the pointer after zero padding */
 698     pDst = pDst + tot;
 699
 700     /* Swapping the lengths */
 701     j = srcALen;
 702     srcALen = srcBLen;
 703     srcBLen = j;
 704
 705     /* Setting the reverse flag */
 706     inv = 1;
 707
 708   }
 709
 710   /* Loop to calculate convolution for output length number of times */
 711   for (i = 0u; i <= tot; i++)
 712   {
 713     /* Initialize sum with zero to carry on MAC operations */
 714     sum = 0.0f;
 715
 716     /* Loop to perform MAC operations according to convolution equation */
 717     for (j = 0u; j <= i; j++)
 718     {
 719       /* Check the array limitations */
 720       if((((i - j) < srcBLen) && (j < srcALen)))
 721       {
 722         /* z[i] += x[i-j] * y[j] */
 723         sum += pIn1[j] * pIn2[-((int32_t) i - j)];
 724       }
 725     }
 726     /* Store the output in the destination buffer */
 727     if(inv == 1)
 728       *pDst-- = sum;
 729     else
 730       *pDst++ = sum;
 731   }
 732
 733 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
 734
 735 }
 736
 737 /**
 738  * @} end of Corr group
 739  */