1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_f32.c
10 * Description: Convolution of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @defgroup Conv Convolution
50 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
51 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
52 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
53 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
56 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
57 * Then the convolution
65 * \image html ConvolutionEquation.gif
67 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
68 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
69 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
70 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
73 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
74 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
75 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
78 * Note that convolution is a commutative operation:
81 * a[n] * b[n] = b[n] * a[n].
85 * This means that switching the A and B arguments to the convolution functions has no effect.
87 * <b>Fixed-Point Behavior</b>
90 * Convolution requires summing up a large number of intermediate products.
91 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
92 * Refer to the function specific documentation below for further details of the particular algorithm used.
95 * <b>Fast Versions</b>
98 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
99 * the input signals should be scaled down to avoid intermediate overflows.
102 * <b>Opt Versions</b>
105 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
106 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
115 * @brief Convolution of floating-point sequences.
116 * @param[in] *pSrcA points to the first input sequence.
117 * @param[in] srcALen length of the first input sequence.
118 * @param[in] *pSrcB points to the second input sequence.
119 * @param[in] srcBLen length of the second input sequence.
120 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
133 #ifndef ARM_MATH_CM0_FAMILY
135 /* Run the below code for Cortex-M4 and Cortex-M3 */
137 float32_t *pIn1; /* inputA pointer */
138 float32_t *pIn2; /* inputB pointer */
139 float32_t *pOut = pDst; /* output pointer */
140 float32_t *px; /* Intermediate inputA pointer */
141 float32_t *py; /* Intermediate inputB pointer */
142 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
143 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
144 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
145 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
147 /* The algorithm implementation is based on the lengths of the inputs. */
148 /* srcB is always made to slide across srcA. */
149 /* So srcBLen is always considered as shorter or equal to srcALen */
150 if(srcALen >= srcBLen)
152 /* Initialization of inputA pointer */
155 /* Initialization of inputB pointer */
160 /* Initialization of inputA pointer */
163 /* Initialization of inputB pointer */
166 /* srcBLen is always considered as shorter or equal to srcALen */
172 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
173 /* The function is internally
174 * divided into three stages according to the number of multiplications that has to be
175 * taken place between inputA samples and inputB samples. In the first stage of the
176 * algorithm, the multiplications increase by one for every iteration.
177 * In the second stage of the algorithm, srcBLen number of multiplications are done.
178 * In the third stage of the algorithm, the multiplications decrease by one
179 * for every iteration. */
181 /* The algorithm is implemented in three stages.
182 The loop counters of each stage is initiated here. */
183 blockSize1 = srcBLen - 1u;
184 blockSize2 = srcALen - (srcBLen - 1u);
185 blockSize3 = blockSize1;
187 /* --------------------------
188 * initializations of stage1
189 * -------------------------*/
192 * sum = x[0] * y[1] + x[1] * y[0]
194 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
197 /* In this stage the MAC operations are increased by 1 for every iteration.
198 The count variable holds the number of MAC operations performed */
201 /* Working pointer of inputA */
204 /* Working pointer of inputB */
208 /* ------------------------
210 * ----------------------*/
212 /* The first stage starts here */
213 while(blockSize1 > 0u)
215 /* Accumulator is made zero for every iteration */
218 /* Apply loop unrolling and compute 4 MACs simultaneously. */
221 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
222 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
225 /* x[0] * y[srcBLen - 1] */
226 sum += *px++ * *py--;
228 /* x[1] * y[srcBLen - 2] */
229 sum += *px++ * *py--;
231 /* x[2] * y[srcBLen - 3] */
232 sum += *px++ * *py--;
234 /* x[3] * y[srcBLen - 4] */
235 sum += *px++ * *py--;
237 /* Decrement the loop counter */
241 /* If the count is not a multiple of 4, compute any remaining MACs here.
242 ** No loop unrolling is used. */
247 /* Perform the multiply-accumulate */
248 sum += *px++ * *py--;
250 /* Decrement the loop counter */
254 /* Store the result in the accumulator in the destination buffer. */
257 /* Update the inputA and inputB pointers for next MAC calculation */
261 /* Increment the MAC count */
264 /* Decrement the loop counter */
268 /* --------------------------
269 * Initializations of stage2
270 * ------------------------*/
272 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
273 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
275 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
278 /* Working pointer of inputA */
281 /* Working pointer of inputB */
282 pSrc2 = pIn2 + (srcBLen - 1u);
285 /* count is index by which the pointer pIn1 to be incremented */
288 /* -------------------
290 * ------------------*/
292 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
293 * So, to loop unroll over blockSize2,
294 * srcBLen should be greater than or equal to 4 */
297 /* Loop unroll over blockSize2, by 4 */
298 blkCnt = blockSize2 >> 2u;
302 /* Set all accumulators to zero */
308 /* read x[0], x[1], x[2] samples */
313 /* Apply loop unrolling and compute 4 MACs simultaneously. */
316 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
317 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
320 /* Read y[srcBLen - 1] sample */
323 /* Read x[3] sample */
326 /* Perform the multiply-accumulate */
327 /* acc0 += x[0] * y[srcBLen - 1] */
330 /* acc1 += x[1] * y[srcBLen - 1] */
333 /* acc2 += x[2] * y[srcBLen - 1] */
336 /* acc3 += x[3] * y[srcBLen - 1] */
339 /* Read y[srcBLen - 2] sample */
342 /* Read x[4] sample */
345 /* Perform the multiply-accumulate */
346 /* acc0 += x[1] * y[srcBLen - 2] */
348 /* acc1 += x[2] * y[srcBLen - 2] */
350 /* acc2 += x[3] * y[srcBLen - 2] */
352 /* acc3 += x[4] * y[srcBLen - 2] */
355 /* Read y[srcBLen - 3] sample */
358 /* Read x[5] sample */
361 /* Perform the multiply-accumulates */
362 /* acc0 += x[2] * y[srcBLen - 3] */
364 /* acc1 += x[3] * y[srcBLen - 2] */
366 /* acc2 += x[4] * y[srcBLen - 2] */
368 /* acc3 += x[5] * y[srcBLen - 2] */
371 /* Read y[srcBLen - 4] sample */
374 /* Read x[6] sample */
378 /* Perform the multiply-accumulates */
379 /* acc0 += x[3] * y[srcBLen - 4] */
381 /* acc1 += x[4] * y[srcBLen - 4] */
383 /* acc2 += x[5] * y[srcBLen - 4] */
385 /* acc3 += x[6] * y[srcBLen - 4] */
391 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
392 ** No loop unrolling is used. */
397 /* Read y[srcBLen - 5] sample */
400 /* Read x[7] sample */
403 /* Perform the multiply-accumulates */
404 /* acc0 += x[4] * y[srcBLen - 5] */
406 /* acc1 += x[5] * y[srcBLen - 5] */
408 /* acc2 += x[6] * y[srcBLen - 5] */
410 /* acc3 += x[7] * y[srcBLen - 5] */
413 /* Reuse the present samples for the next MAC */
418 /* Decrement the loop counter */
422 /* Store the result in the accumulator in the destination buffer. */
428 /* Increment the pointer pIn1 index, count by 4 */
431 /* Update the inputA and inputB pointers for next MAC calculation */
436 /* Decrement the loop counter */
441 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
442 ** No loop unrolling is used. */
443 blkCnt = blockSize2 % 0x4u;
447 /* Accumulator is made zero for every iteration */
450 /* Apply loop unrolling and compute 4 MACs simultaneously. */
453 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
454 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
457 /* Perform the multiply-accumulates */
458 sum += *px++ * *py--;
459 sum += *px++ * *py--;
460 sum += *px++ * *py--;
461 sum += *px++ * *py--;
463 /* Decrement the loop counter */
467 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
468 ** No loop unrolling is used. */
473 /* Perform the multiply-accumulate */
474 sum += *px++ * *py--;
476 /* Decrement the loop counter */
480 /* Store the result in the accumulator in the destination buffer. */
483 /* Increment the MAC count */
486 /* Update the inputA and inputB pointers for next MAC calculation */
490 /* Decrement the loop counter */
496 /* If the srcBLen is not a multiple of 4,
497 * the blockSize2 loop cannot be unrolled by 4 */
502 /* Accumulator is made zero for every iteration */
505 /* srcBLen number of MACS should be performed */
510 /* Perform the multiply-accumulate */
511 sum += *px++ * *py--;
513 /* Decrement the loop counter */
517 /* Store the result in the accumulator in the destination buffer. */
520 /* Increment the MAC count */
523 /* Update the inputA and inputB pointers for next MAC calculation */
527 /* Decrement the loop counter */
533 /* --------------------------
534 * Initializations of stage3
535 * -------------------------*/
537 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
538 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
540 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
541 * sum += x[srcALen-1] * y[srcBLen-1]
544 /* In this stage the MAC operations are decreased by 1 for every iteration.
545 The blockSize3 variable holds the number of MAC operations performed */
547 /* Working pointer of inputA */
548 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
551 /* Working pointer of inputB */
552 pSrc2 = pIn2 + (srcBLen - 1u);
555 /* -------------------
557 * ------------------*/
559 while(blockSize3 > 0u)
561 /* Accumulator is made zero for every iteration */
564 /* Apply loop unrolling and compute 4 MACs simultaneously. */
565 k = blockSize3 >> 2u;
567 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
568 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
571 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
572 sum += *px++ * *py--;
574 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
575 sum += *px++ * *py--;
577 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
578 sum += *px++ * *py--;
580 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
581 sum += *px++ * *py--;
583 /* Decrement the loop counter */
587 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
588 ** No loop unrolling is used. */
589 k = blockSize3 % 0x4u;
593 /* Perform the multiply-accumulates */
594 /* sum += x[srcALen-1] * y[srcBLen-1] */
595 sum += *px++ * *py--;
597 /* Decrement the loop counter */
601 /* Store the result in the accumulator in the destination buffer. */
604 /* Update the inputA and inputB pointers for next MAC calculation */
608 /* Decrement the loop counter */
614 /* Run the below code for Cortex-M0 */
616 float32_t *pIn1 = pSrcA; /* inputA pointer */
617 float32_t *pIn2 = pSrcB; /* inputB pointer */
618 float32_t sum; /* Accumulator */
619 uint32_t i, j; /* loop counters */
621 /* Loop to calculate convolution for output length number of times */
622 for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
624 /* Initialize sum with zero to carry out MAC operations */
627 /* Loop to perform MAC operations according to convolution equation */
628 for (j = 0u; j <= i; j++)
630 /* Check the array limitations */
631 if((((i - j) < srcBLen) && (j < srcALen)))
633 /* z[i] += x[i-j] * y[j] */
634 sum += pIn1[j] * pIn2[i - j];
637 /* Store the output in the destination buffer */
641 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
646 * @} end of Conv group