1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
10 * Description: Convolution of Q7 sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Convolution of Q7 sequences.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
62 * <b>Scaling and Overflow Behavior:</b>
65 * The function is implemented using a 32-bit internal accumulator.
66 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
67 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
68 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
69 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
72 * Refer to the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
85 #ifndef ARM_MATH_CM0_FAMILY
87 /* Run the below code for Cortex-M4 and Cortex-M3 */
89 q7_t *pIn1; /* inputA pointer */
90 q7_t *pIn2; /* inputB pointer */
91 q7_t *pOut = pDst; /* output pointer */
92 q7_t *px; /* Intermediate inputA pointer */
93 q7_t *py; /* Intermediate inputB pointer */
94 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
95 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
96 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
97 q31_t input1, input2; /* Temporary input variables */
98 q15_t in1, in2; /* Temporary input variables */
99 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
101 /* The algorithm implementation is based on the lengths of the inputs. */
102 /* srcB is always made to slide across srcA. */
103 /* So srcBLen is always considered as shorter than or equal to srcALen */
104 if(srcALen >= srcBLen)
106 /* Initialization of inputA pointer */
109 /* Initialization of inputB pointer */
114 /* Initialization of inputA pointer */
117 /* Initialization of inputB pointer */
120 /* srcBLen is always considered as shorter than or equal to srcALen */
126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127 /* The function is internally
128 * divided into three stages according to the number of multiplications that have to
129 * take place between inputA samples and inputB samples. In the first stage of the
130 * algorithm, the multiplications increase by one for every iteration.
131 * In the second stage of the algorithm, srcBLen number of multiplications are done.
132 * In the third stage of the algorithm, the multiplications decrease by one
133 * for every iteration. */
135 /* The algorithm is implemented in three stages.
136 The loop counters of each stage are initialized here. */
137 blockSize1 = srcBLen - 1u;
138 blockSize2 = (srcALen - srcBLen) + 1u;
139 blockSize3 = blockSize1;
141 /* --------------------------
142 * Initializations of stage1
143 * -------------------------*/
146 * sum = x[0] * y[1] + x[1] * y[0]
148 * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
151 /* In this stage the MAC operations are increased by 1 for every iteration.
152 The count variable holds the number of MAC operations performed */
155 /* Working pointer of inputA */
158 /* Working pointer of inputB */
162 /* ------------------------
164 * ----------------------*/
166 /* The first stage starts here */
167 while(blockSize1 > 0u)
169 /* Accumulator is made zero for every iteration */
172 /* Apply loop unrolling and compute 4 MACs simultaneously. */
175 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
176 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
180 in1 = (q15_t) * px++;
181 in2 = (q15_t) * px++;
182 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
184 /* y[srcBLen - 1] , y[srcBLen - 2] */
185 in1 = (q15_t) * py--;
186 in2 = (q15_t) * py--;
187 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
189 /* x[0] * y[srcBLen - 1] */
190 /* x[1] * y[srcBLen - 2] */
191 sum = __SMLAD(input1, input2, sum);
194 in1 = (q15_t) * px++;
195 in2 = (q15_t) * px++;
196 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
198 /* y[srcBLen - 3] , y[srcBLen - 4] */
199 in1 = (q15_t) * py--;
200 in2 = (q15_t) * py--;
201 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
203 /* x[2] * y[srcBLen - 3] */
204 /* x[3] * y[srcBLen - 4] */
205 sum = __SMLAD(input1, input2, sum);
207 /* Decrement the loop counter */
211 /* If the count is not a multiple of 4, compute any remaining MACs here.
212 ** No loop unrolling is used. */
217 /* Perform the multiply-accumulates */
218 sum += ((q15_t) * px++ * *py--);
220 /* Decrement the loop counter */
224 /* Store the result in the accumulator in the destination buffer. */
225 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
227 /* Update the inputA and inputB pointers for next MAC calculation */
231 /* Increment the MAC count */
234 /* Decrement the loop counter */
238 /* --------------------------
239 * Initializations of stage2
240 * ------------------------*/
242 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
243 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
245 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
248 /* Working pointer of inputA */
251 /* Working pointer of inputB */
252 pSrc2 = pIn2 + (srcBLen - 1u);
255 /* count is the index by which the pointer pIn1 is to be incremented */
258 /* -------------------
260 * ------------------*/
262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263 * So, to loop unroll over blockSize2,
264 * srcBLen should be greater than or equal to 4 */
267 /* Loop unroll over blockSize2, by 4 */
268 blkCnt = blockSize2 >> 2u;
272 /* Set all accumulators to zero */
278 /* read x[0], x[1], x[2] samples */
283 /* Apply loop unrolling and compute 4 MACs simultaneously. */
286 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
287 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
290 /* Read y[srcBLen - 1] sample */
292 /* Read y[srcBLen - 2] sample */
295 /* Read x[3] sample */
298 /* x[0] and x[1] are packed */
302 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
304 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
308 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
310 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
311 acc0 = __SMLAD(input1, input2, acc0);
313 /* x[1] and x[2] are packed */
317 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
319 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
320 acc1 = __SMLAD(input1, input2, acc1);
322 /* x[2] and x[3] are packed */
326 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
328 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
329 acc2 = __SMLAD(input1, input2, acc2);
331 /* Read x[4] sample */
334 /* x[3] and x[4] are packed */
338 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341 acc3 = __SMLAD(input1, input2, acc3);
343 /* Read y[srcBLen - 3] sample */
345 /* Read y[srcBLen - 4] sample */
348 /* Read x[5] sample */
351 /* x[2] and x[3] are packed */
355 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
357 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
361 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
363 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
364 acc0 = __SMLAD(input1, input2, acc0);
366 /* x[3] and x[4] are packed */
370 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
373 acc1 = __SMLAD(input1, input2, acc1);
375 /* x[4] and x[5] are packed */
379 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
381 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
382 acc2 = __SMLAD(input1, input2, acc2);
384 /* Read x[6] sample */
387 /* x[5] and x[6] are packed */
391 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
393 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
394 acc3 = __SMLAD(input1, input2, acc3);
398 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
399 ** No loop unrolling is used. */
404 /* Read y[srcBLen - 5] sample */
407 /* Read x[7] sample */
410 /* Perform the multiply-accumulates */
411 /* acc0 += x[4] * y[srcBLen - 5] */
412 acc0 += ((q15_t) x0 * c0);
413 /* acc1 += x[5] * y[srcBLen - 5] */
414 acc1 += ((q15_t) x1 * c0);
415 /* acc2 += x[6] * y[srcBLen - 5] */
416 acc2 += ((q15_t) x2 * c0);
417 /* acc3 += x[7] * y[srcBLen - 5] */
418 acc3 += ((q15_t) x3 * c0);
420 /* Reuse the present samples for the next MAC */
425 /* Decrement the loop counter */
430 /* Store the result in the accumulator in the destination buffer. */
431 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
432 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
433 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
434 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
436 /* Increment the pointer pIn1 index, count by 4 */
439 /* Update the inputA and inputB pointers for next MAC calculation */
443 /* Decrement the loop counter */
447 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
448 ** No loop unrolling is used. */
449 blkCnt = blockSize2 % 0x4u;
453 /* Accumulator is made zero for every iteration */
456 /* Apply loop unrolling and compute 4 MACs simultaneously. */
459 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
460 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
464 /* Reading two inputs of SrcA buffer and packing */
465 in1 = (q15_t) * px++;
466 in2 = (q15_t) * px++;
467 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
469 /* Reading two inputs of SrcB buffer and packing */
470 in1 = (q15_t) * py--;
471 in2 = (q15_t) * py--;
472 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
474 /* Perform the multiply-accumulates */
475 sum = __SMLAD(input1, input2, sum);
477 /* Reading two inputs of SrcA buffer and packing */
478 in1 = (q15_t) * px++;
479 in2 = (q15_t) * px++;
480 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
482 /* Reading two inputs of SrcB buffer and packing */
483 in1 = (q15_t) * py--;
484 in2 = (q15_t) * py--;
485 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
487 /* Perform the multiply-accumulates */
488 sum = __SMLAD(input1, input2, sum);
490 /* Decrement the loop counter */
494 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
495 ** No loop unrolling is used. */
500 /* Perform the multiply-accumulates */
501 sum += ((q15_t) * px++ * *py--);
503 /* Decrement the loop counter */
507 /* Store the result in the accumulator in the destination buffer. */
508 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
510 /* Increment the pointer pIn1 index, count by 1 */
513 /* Update the inputA and inputB pointers for next MAC calculation */
517 /* Decrement the loop counter */
523 /* If the srcBLen is less than 4,
524 * the blockSize2 loop cannot be unrolled by 4 */
529 /* Accumulator is made zero for every iteration */
532 /* srcBLen number of MACS should be performed */
537 /* Perform the multiply-accumulate */
538 sum += ((q15_t) * px++ * *py--);
540 /* Decrement the loop counter */
544 /* Store the result in the accumulator in the destination buffer. */
545 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
547 /* Increment the MAC count */
550 /* Update the inputA and inputB pointers for next MAC calculation */
554 /* Decrement the loop counter */
560 /* --------------------------
561 * Initializations of stage3
562 * -------------------------*/
564 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
565 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
567 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
568 * sum += x[srcALen-1] * y[srcBLen-1]
571 /* In this stage the MAC operations are decreased by 1 for every iteration.
572 The blockSize3 variable holds the number of MAC operations performed */
574 /* Working pointer of inputA */
575 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
578 /* Working pointer of inputB */
579 pSrc2 = pIn2 + (srcBLen - 1u);
582 /* -------------------
584 * ------------------*/
586 while(blockSize3 > 0u)
588 /* Accumulator is made zero for every iteration */
591 /* Apply loop unrolling and compute 4 MACs simultaneously. */
592 k = blockSize3 >> 2u;
594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
598 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
599 in1 = (q15_t) * px++;
600 in2 = (q15_t) * px++;
601 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
603 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
604 in1 = (q15_t) * py--;
605 in2 = (q15_t) * py--;
606 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
608 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
609 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
610 sum = __SMLAD(input1, input2, sum);
612 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
613 in1 = (q15_t) * px++;
614 in2 = (q15_t) * px++;
615 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
617 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
618 in1 = (q15_t) * py--;
619 in2 = (q15_t) * py--;
620 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
622 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
623 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
624 sum = __SMLAD(input1, input2, sum);
626 /* Decrement the loop counter */
630 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
631 ** No loop unrolling is used. */
632 k = blockSize3 % 0x4u;
636 /* Perform the multiply-accumulates */
637 sum += ((q15_t) * px++ * *py--);
639 /* Decrement the loop counter */
643 /* Store the result in the accumulator in the destination buffer. */
644 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
646 /* Update the inputA and inputB pointers for next MAC calculation */
650 /* Decrement the loop counter */
656 /* Run the below code for Cortex-M0 */
658 q7_t *pIn1 = pSrcA; /* input pointer */
659 q7_t *pIn2 = pSrcB; /* coefficient pointer */
660 q31_t sum; /* Accumulator */
661 uint32_t i, j; /* loop counter */
663 /* Loop to calculate output of convolution for output length number of times */
664 for (i = 0; i < (srcALen + srcBLen - 1); i++)
666 /* Initialize sum with zero to carry on MAC operations */
669 /* Loop to perform MAC operations according to convolution equation */
670 for (j = 0; j <= i; j++)
672 /* Check the array limitations */
673 if(((i - j) < srcBLen) && (j < srcALen))
675 /* z[i] += x[i-j] * y[j] */
676 sum += (q15_t) pIn1[j] * (pIn2[i - j]);
680 /* Store the output in the destination buffer */
681 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
689 * @} end of Conv group