tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/TransformFunctions/arm_dct4_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_dct4_q31.c
   9 *
  10 * Description:  Processing function of DCT4 & IDCT4 Q31.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @addtogroup DCT4_IDCT4
  45  * @{
  46  */
  47
  48 /**
  49  * @brief Processing function for the Q31 DCT4/IDCT4.
  50  * @param[in]       *S             points to an instance of the Q31 DCT4 structure.
  51  * @param[in]       *pState        points to state buffer.
  52  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
  53  * @return none.
  54  * \par Input an output formats:
  55  * Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
  56  * as the conversion from DCT2 to DCT4 involves one subtraction.
  57  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
  58  * Number of bits downscaled, depends on the size of the transform.
  59  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
  60  *
  61  * \image html dct4FormatsQ31Table.gif
  62  */
  63
  64 void arm_dct4_q31(
  65   const arm_dct4_instance_q31 * S,
  66   q31_t * pState,
  67   q31_t * pInlineBuffer)
  68 {
  69   uint16_t i;                                    /* Loop counter */
  70   q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
  71   q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
  72   q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
  73   q31_t in;                                      /* Temporary variable */
  74
  75
  76   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
  77    * along with some pre-processing and post-processing.
  78    * Computational procedure is explained as follows:
  79    * (a) Pre-processing involves multiplying input with cos factor,
  80    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
  81    *              where,
  82    *                 r(n) -- output of preprocessing
  83    *                 u(n) -- input to preprocessing(actual Source buffer)
  84    * (b) Calculation of DCT2 using FFT is divided into three steps:
  85    *                  Step1: Re-ordering of even and odd elements of input.
  86    *                  Step2: Calculating FFT of the re-ordered input.
  87    *                  Step3: Taking the real part of the product of FFT output and weights.
  88    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
  89    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  90    *                        where,
  91    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
  92    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
  93    */
  94
  95         /*-------- Pre-processing ------------*/
  96   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
  97   arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N);
  98   arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N);
  99
 100   /* ----------------------------------------------------------------
 101    * Step1: Re-ordering of even and odd elements as
 102    *             pState[i] =  pInlineBuffer[2*i] and
 103    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
 104    ---------------------------------------------------------------------*/
 105
 106   /* pS1 initialized to pState */
 107   pS1 = pState;
 108
 109   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
 110   pS2 = pState + (S->N - 1u);
 111
 112   /* pbuff initialized to input buffer */
 113   pbuff = pInlineBuffer;
 114
 115 #ifndef ARM_MATH_CM0_FAMILY
 116
 117   /* Run the below code for Cortex-M4 and Cortex-M3 */
 118
 119   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
 120   i = S->Nby2 >> 2u;
 121
 122   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 123    ** a second loop below computes the remaining 1 to 3 samples. */
 124   do
 125   {
 126     /* Re-ordering of even and odd elements */
 127     /* pState[i] =  pInlineBuffer[2*i] */
 128     *pS1++ = *pbuff++;
 129     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 130     *pS2-- = *pbuff++;
 131
 132     *pS1++ = *pbuff++;
 133     *pS2-- = *pbuff++;
 134
 135     *pS1++ = *pbuff++;
 136     *pS2-- = *pbuff++;
 137
 138     *pS1++ = *pbuff++;
 139     *pS2-- = *pbuff++;
 140
 141     /* Decrement the loop counter */
 142     i--;
 143   } while(i > 0u);
 144
 145   /* pbuff initialized to input buffer */
 146   pbuff = pInlineBuffer;
 147
 148   /* pS1 initialized to pState */
 149   pS1 = pState;
 150
 151   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 152   i = S->N >> 2u;
 153
 154   /* Processing with loop unrolling 4 times as N is always multiple of 4.
 155    * Compute 4 outputs at a time */
 156   do
 157   {
 158     /* Writing the re-ordered output back to inplace input buffer */
 159     *pbuff++ = *pS1++;
 160     *pbuff++ = *pS1++;
 161     *pbuff++ = *pS1++;
 162     *pbuff++ = *pS1++;
 163
 164     /* Decrement the loop counter */
 165     i--;
 166   } while(i > 0u);
 167
 168
 169   /* ---------------------------------------------------------
 170    *     Step2: Calculate RFFT for N-point input
 171    * ---------------------------------------------------------- */
 172   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 173   arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
 174
 175   /*----------------------------------------------------------------------
 176    *  Step3: Multiply the FFT output with the weights.
 177    *----------------------------------------------------------------------*/
 178   arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
 179
 180   /* The output of complex multiplication is in 3.29 format.
 181    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
 182   arm_shift_q31(pState, 2, pState, S->N * 2);
 183
 184   /* ----------- Post-processing ---------- */
 185   /* DCT-IV can be obtained from DCT-II by the equation,
 186    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 187    *       Hence, Y4(0) = Y2(0)/2  */
 188   /* Getting only real part from the output and Converting to DCT-IV */
 189
 190   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
 191   i = (S->N - 1u) >> 2u;
 192
 193   /* pbuff initialized to input buffer. */
 194   pbuff = pInlineBuffer;
 195
 196   /* pS1 initialized to pState */
 197   pS1 = pState;
 198
 199   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 200   in = *pS1++ >> 1u;
 201   /* input buffer acts as inplace, so output values are stored in the input itself. */
 202   *pbuff++ = in;
 203
 204   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 205   pS1++;
 206
 207   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 208    ** a second loop below computes the remaining 1 to 3 samples. */
 209   do
 210   {
 211     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 212     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 213     in = *pS1++ - in;
 214     *pbuff++ = in;
 215     /* points to the next real value */
 216     pS1++;
 217
 218     in = *pS1++ - in;
 219     *pbuff++ = in;
 220     pS1++;
 221
 222     in = *pS1++ - in;
 223     *pbuff++ = in;
 224     pS1++;
 225
 226     in = *pS1++ - in;
 227     *pbuff++ = in;
 228     pS1++;
 229
 230     /* Decrement the loop counter */
 231     i--;
 232   } while(i > 0u);
 233
 234   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 235    ** No loop unrolling is used. */
 236   i = (S->N - 1u) % 0x4u;
 237
 238   while(i > 0u)
 239   {
 240     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 241     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 242     in = *pS1++ - in;
 243     *pbuff++ = in;
 244     /* points to the next real value */
 245     pS1++;
 246
 247     /* Decrement the loop counter */
 248     i--;
 249   }
 250
 251
 252         /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 253
 254   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 255   i = S->N >> 2u;
 256
 257   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 258   pbuff = pInlineBuffer;
 259
 260   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
 261   do
 262   {
 263     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 264     in = *pbuff;
 265     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
 266
 267     in = *pbuff;
 268     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
 269
 270     in = *pbuff;
 271     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
 272
 273     in = *pbuff;
 274     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
 275
 276     /* Decrement the loop counter */
 277     i--;
 278   } while(i > 0u);
 279
 280
 281 #else
 282
 283   /* Run the below code for Cortex-M0 */
 284
 285   /* Initializing the loop counter to N/2 */
 286   i = S->Nby2;
 287
 288   do
 289   {
 290     /* Re-ordering of even and odd elements */
 291     /* pState[i] =  pInlineBuffer[2*i] */
 292     *pS1++ = *pbuff++;
 293     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 294     *pS2-- = *pbuff++;
 295
 296     /* Decrement the loop counter */
 297     i--;
 298   } while(i > 0u);
 299
 300   /* pbuff initialized to input buffer */
 301   pbuff = pInlineBuffer;
 302
 303   /* pS1 initialized to pState */
 304   pS1 = pState;
 305
 306   /* Initializing the loop counter */
 307   i = S->N;
 308
 309   do
 310   {
 311     /* Writing the re-ordered output back to inplace input buffer */
 312     *pbuff++ = *pS1++;
 313
 314     /* Decrement the loop counter */
 315     i--;
 316   } while(i > 0u);
 317
 318
 319   /* ---------------------------------------------------------
 320    *     Step2: Calculate RFFT for N-point input
 321    * ---------------------------------------------------------- */
 322   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 323   arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
 324
 325   /*----------------------------------------------------------------------
 326    *  Step3: Multiply the FFT output with the weights.
 327    *----------------------------------------------------------------------*/
 328   arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
 329
 330   /* The output of complex multiplication is in 3.29 format.
 331    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
 332   arm_shift_q31(pState, 2, pState, S->N * 2);
 333
 334   /* ----------- Post-processing ---------- */
 335   /* DCT-IV can be obtained from DCT-II by the equation,
 336    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 337    *       Hence, Y4(0) = Y2(0)/2  */
 338   /* Getting only real part from the output and Converting to DCT-IV */
 339
 340   /* pbuff initialized to input buffer. */
 341   pbuff = pInlineBuffer;
 342
 343   /* pS1 initialized to pState */
 344   pS1 = pState;
 345
 346   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 347   in = *pS1++ >> 1u;
 348   /* input buffer acts as inplace, so output values are stored in the input itself. */
 349   *pbuff++ = in;
 350
 351   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 352   pS1++;
 353
 354   /* Initializing the loop counter */
 355   i = (S->N - 1u);
 356
 357   while(i > 0u)
 358   {
 359     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 360     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 361     in = *pS1++ - in;
 362     *pbuff++ = in;
 363     /* points to the next real value */
 364     pS1++;
 365
 366     /* Decrement the loop counter */
 367     i--;
 368   }
 369
 370
 371         /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 372
 373   /* Initializing the loop counter */
 374   i = S->N;
 375
 376   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 377   pbuff = pInlineBuffer;
 378
 379   do
 380   {
 381     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 382     in = *pbuff;
 383     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
 384
 385     /* Decrement the loop counter */
 386     i--;
 387   } while(i > 0u);
 388
 389 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 390
 391 }
 392
 393 /**
 394    * @} end of DCT4_IDCT4 group
 395    */