1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_fast_q15.c
10 * Description: Fast Q15 Correlation.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
61 * <b>Scaling and Overflow Behavior:</b>
64 * This fast version uses a 32-bit accumulator with 2.30 format.
65 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
66 * There is no saturation on intermediate additions.
67 * Thus, if the accumulator overflows it wraps around and distorts the result.
68 * The input signals should be scaled down to avoid intermediate overflows.
69 * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at most
70 * min(srcALen, srcBLen) additions are carried out internally.
71 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
74 * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
77 void arm_correlate_fast_q15(
84 #ifndef UNALIGNED_SUPPORT_DISABLE
86 q15_t *pIn1; /* inputA pointer */
87 q15_t *pIn2; /* inputB pointer */
88 q15_t *pOut = pDst; /* output pointer */
89 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
90 q15_t *px; /* Intermediate inputA pointer */
91 q15_t *py; /* Intermediate inputB pointer */
92 q15_t *pSrc1; /* Intermediate pointers */
93 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
94 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
95 int32_t inc = 1; /* Destination address modifier */
98 /* The algorithm implementation is based on the lengths of the inputs. */
99 /* srcB is always made to slide across srcA. */
100 /* So srcBLen is always considered as shorter or equal to srcALen */
101 /* But CORR(x, y) is reverse of CORR(y, x) */
102 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
103 /* and the destination pointer modifier, inc is set to -1 */
104 /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
105 /* But to improve the performance,
106 * we include zeroes in the output instead of zero padding either of the inputs */
107 /* If srcALen > srcBLen,
108 * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
109 /* If srcALen < srcBLen,
110 * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
111 if(srcALen >= srcBLen)
113 /* Initialization of inputA pointer */
116 /* Initialization of inputB pointer */
119 /* Number of output samples is calculated */
120 outBlockSize = (2u * srcALen) - 1u;
122 /* When srcALen > srcBLen, zero padding is done to srcB
123 * to make their lengths equal.
124 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
125 * number of output samples are made zero */
126 j = outBlockSize - (srcALen + (srcBLen - 1u));
128 /* Updating the pointer position to non zero value */
134 /* Initialization of inputA pointer */
137 /* Initialization of inputB pointer */
140 /* srcBLen is always considered as shorter or equal to srcALen */
145 /* CORR(x, y) = Reverse order(CORR(y, x)) */
146 /* Hence set the destination pointer to point to the last output sample */
147 pOut = pDst + ((srcALen + srcBLen) - 2u);
149 /* Destination address modifier is set to -1 */
154 /* The function is internally
155 * divided into three parts according to the number of multiplications that have to
156 * take place between inputA samples and inputB samples. In the first part of the
157 * algorithm, the multiplications increase by one for every iteration.
158 * In the second part of the algorithm, srcBLen multiplications are done.
159 * In the third part of the algorithm, the multiplications decrease by one
160 * for every iteration. */
161 /* The algorithm is implemented in three stages.
162 * The loop counters of each stage are initialized here. */
163 blockSize1 = srcBLen - 1u;
164 blockSize2 = srcALen - (srcBLen - 1u);
165 blockSize3 = blockSize1;
167 /* --------------------------
168 * Initializations of stage1
169 * -------------------------*/
171 /* sum = x[0] * y[srcBlen - 1]
172 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
174 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
177 /* In this stage the MAC operations are increased by 1 for every iteration.
178 The count variable holds the number of MAC operations performed */
181 /* Working pointer of inputA */
184 /* Working pointer of inputB */
185 pSrc1 = pIn2 + (srcBLen - 1u);
188 /* ------------------------
190 * ----------------------*/
192 /* The first loop starts here */
193 while(blockSize1 > 0u)
195 /* Accumulator is made zero for every iteration */
198 /* Apply loop unrolling and compute 4 MACs simultaneously. */
201 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
202 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
205 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
206 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
207 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
208 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
210 /* Decrement the loop counter */
214 /* If the count is not a multiple of 4, compute any remaining MACs here.
215 ** No loop unrolling is used. */
220 /* Perform the multiply-accumulates */
221 /* x[0] * y[srcBLen - 1] */
222 sum = __SMLAD(*px++, *py++, sum);
224 /* Decrement the loop counter */
228 /* Store the result in the accumulator in the destination buffer. */
229 *pOut = (q15_t) (sum >> 15);
230 /* Destination pointer is updated according to the address modifier, inc */
233 /* Update the inputA and inputB pointers for next MAC calculation */
237 /* Increment the MAC count */
240 /* Decrement the loop counter */
244 /* --------------------------
245 * Initializations of stage2
246 * ------------------------*/
248 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
249 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
251 * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
254 /* Working pointer of inputA */
257 /* Working pointer of inputB */
260 /* count is the index by which the pointer pIn1 is to be incremented */
263 /* -------------------
265 * ------------------*/
267 /* Stage2 depends on srcBLen, because srcBLen MACs are performed in this stage.
268 * So, to unroll the loop over blockSize2,
269 * srcBLen should be greater than or equal to 4, so that the srcBLen loop can be unrolled too */
272 /* Loop unroll over blockSize2, by 4 */
273 blkCnt = blockSize2 >> 2u;
277 /* Set all accumulators to zero */
283 /* read x[0], x[1] samples */
285 /* read x[1], x[2] samples */
286 x1 = _SIMD32_OFFSET(px + 1);
289 /* Apply loop unrolling and compute 4 MACs simultaneously. */
292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
296 /* Read the first two inputB samples using SIMD:
298 c0 = *__SIMD32(py)++;
300 /* acc0 += x[0] * y[0] + x[1] * y[1] */
301 acc0 = __SMLAD(x0, c0, acc0);
303 /* acc1 += x[1] * y[0] + x[2] * y[1] */
304 acc1 = __SMLAD(x1, c0, acc1);
306 /* Read x[2], x[3] */
309 /* Read x[3], x[4] */
310 x3 = _SIMD32_OFFSET(px + 1);
312 /* acc2 += x[2] * y[0] + x[3] * y[1] */
313 acc2 = __SMLAD(x2, c0, acc2);
315 /* acc3 += x[3] * y[0] + x[4] * y[1] */
316 acc3 = __SMLAD(x3, c0, acc3);
318 /* Read y[2] and y[3] */
319 c0 = *__SIMD32(py)++;
321 /* acc0 += x[2] * y[2] + x[3] * y[3] */
322 acc0 = __SMLAD(x2, c0, acc0);
324 /* acc1 += x[3] * y[2] + x[4] * y[3] */
325 acc1 = __SMLAD(x3, c0, acc1);
327 /* Read x[4], x[5] */
328 x0 = _SIMD32_OFFSET(px + 2);
330 /* Read x[5], x[6] */
331 x1 = _SIMD32_OFFSET(px + 3);
334 /* acc2 += x[4] * y[2] + x[5] * y[3] */
335 acc2 = __SMLAD(x0, c0, acc2);
337 /* acc3 += x[5] * y[2] + x[6] * y[3] */
338 acc3 = __SMLAD(x1, c0, acc3);
342 /* For the next MAC operations, SIMD is not used.
343 * So, the 16-bit pointer of inputB, py, is updated */
345 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
346 ** No loop unrolling is used. */
353 #ifdef ARM_MATH_BIG_ENDIAN
359 c0 = c0 & 0x0000FFFF;
361 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
367 /* Perform the multiply-accumulates */
368 acc0 = __SMLAD(x0, c0, acc0);
369 acc1 = __SMLAD(x1, c0, acc1);
370 acc2 = __SMLADX(x1, c0, acc2);
371 acc3 = __SMLADX(x3, c0, acc3);
376 /* Read y[4], y[5] */
379 /* Read x[7], x[8] */
383 x2 = _SIMD32_OFFSET(px + 1);
386 /* Perform the multiply-accumulates */
387 acc0 = __SMLAD(x0, c0, acc0);
388 acc1 = __SMLAD(x1, c0, acc1);
389 acc2 = __SMLAD(x3, c0, acc2);
390 acc3 = __SMLAD(x2, c0, acc3);
395 /* Read y[4], y[5] */
396 c0 = *__SIMD32(py)++;
398 /* Read x[7], x[8] */
402 x2 = _SIMD32_OFFSET(px + 1);
404 /* Perform the multiply-accumulates */
405 acc0 = __SMLAD(x0, c0, acc0);
406 acc1 = __SMLAD(x1, c0, acc1);
407 acc2 = __SMLAD(x3, c0, acc2);
408 acc3 = __SMLAD(x2, c0, acc3);
412 #ifdef ARM_MATH_BIG_ENDIAN
417 c0 = c0 & 0x0000FFFF;
418 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
421 x3 = _SIMD32_OFFSET(px + 2);
424 /* Perform the multiply-accumulates */
425 acc0 = __SMLADX(x1, c0, acc0);
426 acc1 = __SMLAD(x2, c0, acc1);
427 acc2 = __SMLADX(x2, c0, acc2);
428 acc3 = __SMLADX(x3, c0, acc3);
431 /* Store the result in the accumulator in the destination buffer. */
432 *pOut = (q15_t) (acc0 >> 15);
433 /* Destination pointer is updated according to the address modifier, inc */
436 *pOut = (q15_t) (acc1 >> 15);
439 *pOut = (q15_t) (acc2 >> 15);
442 *pOut = (q15_t) (acc3 >> 15);
445 /* Increment the pointer pIn1 index, count by 1 */
448 /* Update the inputA and inputB pointers for next MAC calculation */
453 /* Decrement the loop counter */
457 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
458 ** No loop unrolling is used. */
459 blkCnt = blockSize2 % 0x4u;
463 /* Accumulator is made zero for every iteration */
466 /* Apply loop unrolling and compute 4 MACs simultaneously. */
469 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
470 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
473 /* Perform the multiply-accumulates */
474 sum += ((q31_t) * px++ * *py++);
475 sum += ((q31_t) * px++ * *py++);
476 sum += ((q31_t) * px++ * *py++);
477 sum += ((q31_t) * px++ * *py++);
479 /* Decrement the loop counter */
483 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
484 ** No loop unrolling is used. */
489 /* Perform the multiply-accumulates */
490 sum += ((q31_t) * px++ * *py++);
492 /* Decrement the loop counter */
496 /* Store the result in the accumulator in the destination buffer. */
497 *pOut = (q15_t) (sum >> 15);
498 /* Destination pointer is updated according to the address modifier, inc */
501 /* Increment the pointer pIn1 index, count by 1 */
504 /* Update the inputA and inputB pointers for next MAC calculation */
508 /* Decrement the loop counter */
514 /* If srcBLen is less than 4,
515 * the blockSize2 loop cannot be unrolled by 4 */
520 /* Accumulator is made zero for every iteration */
523 /* Loop over srcBLen */
528 /* Perform the multiply-accumulate */
529 sum += ((q31_t) * px++ * *py++);
531 /* Decrement the loop counter */
535 /* Store the result in the accumulator in the destination buffer. */
536 *pOut = (q15_t) (sum >> 15);
537 /* Destination pointer is updated according to the address modifier, inc */
540 /* Increment the MAC count */
543 /* Update the inputA and inputB pointers for next MAC calculation */
547 /* Decrement the loop counter */
552 /* --------------------------
553 * Initializations of stage3
554 * -------------------------*/
556 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
557 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
559 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
560 * sum += x[srcALen-1] * y[0]
563 /* In this stage the MAC operations are decreased by 1 for every iteration.
564 The count variable holds the number of MAC operations performed */
565 count = srcBLen - 1u;
567 /* Working pointer of inputA */
568 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
571 /* Working pointer of inputB */
574 /* -------------------
576 * ------------------*/
578 while(blockSize3 > 0u)
580 /* Accumulator is made zero for every iteration */
583 /* Apply loop unrolling and compute 4 MACs simultaneously. */
586 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
587 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
590 /* Perform the multiply-accumulates */
591 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
592 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
593 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
594 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
596 /* Decrement the loop counter */
600 /* If the count is not a multiple of 4, compute any remaining MACs here.
601 ** No loop unrolling is used. */
606 /* Perform the multiply-accumulates */
607 sum = __SMLAD(*px++, *py++, sum);
609 /* Decrement the loop counter */
613 /* Store the result in the accumulator in the destination buffer. */
614 *pOut = (q15_t) (sum >> 15);
615 /* Destination pointer is updated according to the address modifier, inc */
618 /* Update the inputA and inputB pointers for next MAC calculation */
622 /* Decrement the MAC count */
625 /* Decrement the loop counter */
631 q15_t *pIn1; /* inputA pointer */
632 q15_t *pIn2; /* inputB pointer */
633 q15_t *pOut = pDst; /* output pointer */
634 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
635 q15_t *px; /* Intermediate inputA pointer */
636 q15_t *py; /* Intermediate inputB pointer */
637 q15_t *pSrc1; /* Intermediate pointers */
638 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
639 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
640 int32_t inc = 1; /* Destination address modifier */
644 /* The algorithm implementation is based on the lengths of the inputs. */
645 /* srcB is always made to slide across srcA. */
646 /* So srcBLen is always considered as shorter or equal to srcALen */
647 /* But CORR(x, y) is reverse of CORR(y, x) */
648 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
649 /* and the destination pointer modifier, inc is set to -1 */
650 /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
651 /* But to improve the performance,
652 * we include zeroes in the output instead of zero padding either of the inputs */
653 /* If srcALen > srcBLen,
654 * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
655 /* If srcALen < srcBLen,
656 * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
657 if(srcALen >= srcBLen)
659 /* Initialization of inputA pointer */
662 /* Initialization of inputB pointer */
665 /* Number of output samples is calculated */
666 outBlockSize = (2u * srcALen) - 1u;
668 /* When srcALen > srcBLen, zero padding is done to srcB
669 * to make their lengths equal.
670 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
671 * number of output samples are made zero */
672 j = outBlockSize - (srcALen + (srcBLen - 1u));
674 /* Updating the pointer position to non zero value */
680 /* Initialization of inputA pointer */
683 /* Initialization of inputB pointer */
686 /* srcBLen is always considered as shorter or equal to srcALen */
691 /* CORR(x, y) = Reverse order(CORR(y, x)) */
692 /* Hence set the destination pointer to point to the last output sample */
693 pOut = pDst + ((srcALen + srcBLen) - 2u);
695 /* Destination address modifier is set to -1 */
700 /* The function is internally
701 * divided into three parts according to the number of multiplications that have to
702 * take place between inputA samples and inputB samples. In the first part of the
703 * algorithm, the multiplications increase by one for every iteration.
704 * In the second part of the algorithm, srcBLen multiplications are done.
705 * In the third part of the algorithm, the multiplications decrease by one
706 * for every iteration. */
707 /* The algorithm is implemented in three stages.
708 * The loop counters of each stage are initialized here. */
709 blockSize1 = srcBLen - 1u;
710 blockSize2 = srcALen - (srcBLen - 1u);
711 blockSize3 = blockSize1;
713 /* --------------------------
714 * Initializations of stage1
715 * -------------------------*/
717 /* sum = x[0] * y[srcBlen - 1]
718 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
720 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
723 /* In this stage the MAC operations are increased by 1 for every iteration.
724 The count variable holds the number of MAC operations performed */
727 /* Working pointer of inputA */
730 /* Working pointer of inputB */
731 pSrc1 = pIn2 + (srcBLen - 1u);
734 /* ------------------------
736 * ----------------------*/
738 /* The first loop starts here */
739 while(blockSize1 > 0u)
741 /* Accumulator is made zero for every iteration */
744 /* Apply loop unrolling and compute 4 MACs simultaneously. */
747 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
748 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
751 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
752 sum += ((q31_t) * px++ * *py++);
753 sum += ((q31_t) * px++ * *py++);
754 sum += ((q31_t) * px++ * *py++);
755 sum += ((q31_t) * px++ * *py++);
757 /* Decrement the loop counter */
761 /* If the count is not a multiple of 4, compute any remaining MACs here.
762 ** No loop unrolling is used. */
767 /* Perform the multiply-accumulates */
768 /* x[0] * y[srcBLen - 1] */
769 sum += ((q31_t) * px++ * *py++);
771 /* Decrement the loop counter */
775 /* Store the result in the accumulator in the destination buffer. */
776 *pOut = (q15_t) (sum >> 15);
777 /* Destination pointer is updated according to the address modifier, inc */
780 /* Update the inputA and inputB pointers for next MAC calculation */
784 /* Increment the MAC count */
787 /* Decrement the loop counter */
791 /* --------------------------
792 * Initializations of stage2
793 * ------------------------*/
795 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
796 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
798 * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
801 /* Working pointer of inputA */
804 /* Working pointer of inputB */
807 /* count is the index by which the pointer pIn1 is to be incremented */
810 /* -------------------
812 * ------------------*/
814 /* Stage2 depends on srcBLen, because srcBLen MACs are performed in this stage.
815 * So, to unroll the loop over blockSize2,
816 * srcBLen should be greater than or equal to 4, so that the srcBLen loop can be unrolled too */
819 /* Loop unroll over blockSize2, by 4 */
820 blkCnt = blockSize2 >> 2u;
824 /* Set all accumulators to zero */
830 /* read x[0], x[1], x[2] samples */
834 #ifndef ARM_MATH_BIG_ENDIAN
836 x0 = __PKHBT(a, b, 16);
838 x1 = __PKHBT(b, a, 16);
842 x0 = __PKHBT(b, a, 16);
844 x1 = __PKHBT(a, b, 16);
846 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
850 /* Apply loop unrolling and compute 4 MACs simultaneously. */
853 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
854 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
857 /* Read the first two inputB samples using SIMD:
862 #ifndef ARM_MATH_BIG_ENDIAN
864 c0 = __PKHBT(a, b, 16);
868 c0 = __PKHBT(b, a, 16);
870 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
872 /* acc0 += x[0] * y[0] + x[1] * y[1] */
873 acc0 = __SMLAD(x0, c0, acc0);
875 /* acc1 += x[1] * y[0] + x[2] * y[1] */
876 acc1 = __SMLAD(x1, c0, acc1);
878 /* Read x[2], x[3], x[4] */
882 #ifndef ARM_MATH_BIG_ENDIAN
884 x2 = __PKHBT(a, b, 16);
886 x3 = __PKHBT(b, a, 16);
890 x2 = __PKHBT(b, a, 16);
892 x3 = __PKHBT(a, b, 16);
894 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
896 /* acc2 += x[2] * y[0] + x[3] * y[1] */
897 acc2 = __SMLAD(x2, c0, acc2);
899 /* acc3 += x[3] * y[0] + x[4] * y[1] */
900 acc3 = __SMLAD(x3, c0, acc3);
902 /* Read y[2] and y[3] */
908 #ifndef ARM_MATH_BIG_ENDIAN
910 c0 = __PKHBT(a, b, 16);
914 c0 = __PKHBT(b, a, 16);
916 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
918 /* acc0 += x[2] * y[2] + x[3] * y[3] */
919 acc0 = __SMLAD(x2, c0, acc0);
921 /* acc1 += x[3] * y[2] + x[4] * y[3] */
922 acc1 = __SMLAD(x3, c0, acc1);
924 /* Read x[4], x[5], x[6] */
928 #ifndef ARM_MATH_BIG_ENDIAN
930 x0 = __PKHBT(a, b, 16);
932 x1 = __PKHBT(b, a, 16);
936 x0 = __PKHBT(b, a, 16);
938 x1 = __PKHBT(a, b, 16);
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
944 /* acc2 += x[4] * y[2] + x[5] * y[3] */
945 acc2 = __SMLAD(x0, c0, acc2);
947 /* acc3 += x[5] * y[2] + x[6] * y[3] */
948 acc3 = __SMLAD(x1, c0, acc3);
952 /* For the next MAC operations, SIMD is not used.
953 * So, the 16-bit pointer of inputB, py, is updated */
955 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
956 ** No loop unrolling is used. */
963 #ifdef ARM_MATH_BIG_ENDIAN
969 c0 = c0 & 0x0000FFFF;
971 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
979 #ifndef ARM_MATH_BIG_ENDIAN
981 x3 = __PKHBT(a, b, 16);
985 x3 = __PKHBT(b, a, 16);
987 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
991 /* Perform the multiply-accumulates */
992 acc0 = __SMLAD(x0, c0, acc0);
993 acc1 = __SMLAD(x1, c0, acc1);
994 acc2 = __SMLADX(x1, c0, acc2);
995 acc3 = __SMLADX(x3, c0, acc3);
1000 /* Read y[4], y[5] */
1004 #ifndef ARM_MATH_BIG_ENDIAN
1006 c0 = __PKHBT(a, b, 16);
1010 c0 = __PKHBT(b, a, 16);
1012 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1014 /* Read x[7], x[8], x[9] */
1018 #ifndef ARM_MATH_BIG_ENDIAN
1020 x3 = __PKHBT(a, b, 16);
1022 x2 = __PKHBT(b, a, 16);
1026 x3 = __PKHBT(b, a, 16);
1028 x2 = __PKHBT(a, b, 16);
1030 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1034 /* Perform the multiply-accumulates */
1035 acc0 = __SMLAD(x0, c0, acc0);
1036 acc1 = __SMLAD(x1, c0, acc1);
1037 acc2 = __SMLAD(x3, c0, acc2);
1038 acc3 = __SMLAD(x2, c0, acc3);
1043 /* Read y[4], y[5] */
1047 #ifndef ARM_MATH_BIG_ENDIAN
1049 c0 = __PKHBT(a, b, 16);
1053 c0 = __PKHBT(b, a, 16);
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1059 /* Read x[7], x[8], x[9] */
1063 #ifndef ARM_MATH_BIG_ENDIAN
1065 x3 = __PKHBT(a, b, 16);
1067 x2 = __PKHBT(b, a, 16);
1071 x3 = __PKHBT(b, a, 16);
1073 x2 = __PKHBT(a, b, 16);
1075 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1077 /* Perform the multiply-accumulates */
1078 acc0 = __SMLAD(x0, c0, acc0);
1079 acc1 = __SMLAD(x1, c0, acc1);
1080 acc2 = __SMLAD(x3, c0, acc2);
1081 acc3 = __SMLAD(x2, c0, acc3);
1085 #ifdef ARM_MATH_BIG_ENDIAN
1090 c0 = c0 & 0x0000FFFF;
1091 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1096 #ifndef ARM_MATH_BIG_ENDIAN
1098 x3 = __PKHBT(a, b, 16);
1102 x3 = __PKHBT(b, a, 16);
1104 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1108 /* Perform the multiply-accumulates */
1109 acc0 = __SMLADX(x1, c0, acc0);
1110 acc1 = __SMLAD(x2, c0, acc1);
1111 acc2 = __SMLADX(x2, c0, acc2);
1112 acc3 = __SMLADX(x3, c0, acc3);
1115 /* Store the result in the accumulator in the destination buffer. */
1116 *pOut = (q15_t) (acc0 >> 15);
1117 /* Destination pointer is updated according to the address modifier, inc */
1120 *pOut = (q15_t) (acc1 >> 15);
1123 *pOut = (q15_t) (acc2 >> 15);
1126 *pOut = (q15_t) (acc3 >> 15);
1129 /* Increment the pointer pIn1 index, count by 1 */
1132 /* Update the inputA and inputB pointers for next MAC calculation */
1137 /* Decrement the loop counter */
1141 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1142 ** No loop unrolling is used. */
1143 blkCnt = blockSize2 % 0x4u;
1147 /* Accumulator is made zero for every iteration */
1150 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1153 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1154 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1157 /* Perform the multiply-accumulates */
1158 sum += ((q31_t) * px++ * *py++);
1159 sum += ((q31_t) * px++ * *py++);
1160 sum += ((q31_t) * px++ * *py++);
1161 sum += ((q31_t) * px++ * *py++);
1163 /* Decrement the loop counter */
1167 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1168 ** No loop unrolling is used. */
1173 /* Perform the multiply-accumulates */
1174 sum += ((q31_t) * px++ * *py++);
1176 /* Decrement the loop counter */
1180 /* Store the result in the accumulator in the destination buffer. */
1181 *pOut = (q15_t) (sum >> 15);
1182 /* Destination pointer is updated according to the address modifier, inc */
1185 /* Increment the pointer pIn1 index, count by 1 */
1188 /* Update the inputA and inputB pointers for next MAC calculation */
1192 /* Decrement the loop counter */
1198 /* If srcBLen is less than 4,
1199 * the blockSize2 loop cannot be unrolled by 4 */
1200 blkCnt = blockSize2;
1204 /* Accumulator is made zero for every iteration */
1207 /* Loop over srcBLen */
1212 /* Perform the multiply-accumulate */
1213 sum += ((q31_t) * px++ * *py++);
1215 /* Decrement the loop counter */
1219 /* Store the result in the accumulator in the destination buffer. */
1220 *pOut = (q15_t) (sum >> 15);
1221 /* Destination pointer is updated according to the address modifier, inc */
1224 /* Increment the MAC count */
1227 /* Update the inputA and inputB pointers for next MAC calculation */
1231 /* Decrement the loop counter */
1236 /* --------------------------
1237 * Initializations of stage3
1238 * -------------------------*/
1240 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
1241 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
1243 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
1244 * sum += x[srcALen-1] * y[0]
1247 /* In this stage the MAC operations are decreased by 1 for every iteration.
1248 The count variable holds the number of MAC operations performed */
1249 count = srcBLen - 1u;
1251 /* Working pointer of inputA */
1252 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1255 /* Working pointer of inputB */
1258 /* -------------------
1260 * ------------------*/
1262 while(blockSize3 > 0u)
1264 /* Accumulator is made zero for every iteration */
1267 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1271 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1274 /* Perform the multiply-accumulates */
1275 sum += ((q31_t) * px++ * *py++);
1276 sum += ((q31_t) * px++ * *py++);
1277 sum += ((q31_t) * px++ * *py++);
1278 sum += ((q31_t) * px++ * *py++);
1280 /* Decrement the loop counter */
1284 /* If the count is not a multiple of 4, compute any remaining MACs here.
1285 ** No loop unrolling is used. */
1290 /* Perform the multiply-accumulates */
1291 sum += ((q31_t) * px++ * *py++);
1293 /* Decrement the loop counter */
1297 /* Store the result in the accumulator in the destination buffer. */
1298 *pOut = (q15_t) (sum >> 15);
1299 /* Destination pointer is updated according to the address modifier, inc */
1302 /* Update the inputA and inputB pointers for next MAC calculation */
1306 /* Decrement the MAC count */
1309 /* Decrement the loop counter */
1313 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1318 * @} end of Corr group