boost/math/special_functions/detail/lanczos_sse2.hpp

   1 //  (C) Copyright John Maddock 2006.
   2 //  Use, modification and distribution are subject to the
   3 //  Boost Software License, Version 1.0. (See accompanying file
   4 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   5
   6 #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
   7 #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
   8
   9 #ifdef _MSC_VER
  10 #pragma once
  11 #endif
  12
  13 #include <emmintrin.h>
  14
  15 #if defined(__GNUC__) || defined(__PGI)
  16 #define ALIGN16 __attribute__((__aligned__(16)))
  17 #else
  18 #define ALIGN16 __declspec(align(16))
  19 #endif
  20
  21 namespace boost{ namespace math{ namespace lanczos{
  22
  23 template <>
  24 inline double lanczos13m53::lanczos_sum<double>(const double& x)
  25 {
  26    static const ALIGN16 double coeff[26] = {
  27       static_cast<double>(2.506628274631000270164908177133837338626L),
  28       static_cast<double>(1u),
  29       static_cast<double>(210.8242777515793458725097339207133627117L),
  30       static_cast<double>(66u),
  31       static_cast<double>(8071.672002365816210638002902272250613822L),
  32       static_cast<double>(1925u),
  33       static_cast<double>(186056.2653952234950402949897160456992822L),
  34       static_cast<double>(32670u),
  35       static_cast<double>(2876370.628935372441225409051620849613599L),
  36       static_cast<double>(357423u),
  37       static_cast<double>(31426415.58540019438061423162831820536287L),
  38       static_cast<double>(2637558u),
  39       static_cast<double>(248874557.8620541565114603864132294232163L),
  40       static_cast<double>(13339535u),
  41       static_cast<double>(1439720407.311721673663223072794912393972L),
  42       static_cast<double>(45995730u),
  43       static_cast<double>(6039542586.35202800506429164430729792107L),
  44       static_cast<double>(105258076u),
  45       static_cast<double>(17921034426.03720969991975575445893111267L),
  46       static_cast<double>(150917976u),
  47       static_cast<double>(35711959237.35566804944018545154716670596L),
  48       static_cast<double>(120543840u),
  49       static_cast<double>(42919803642.64909876895789904700198885093L),
  50       static_cast<double>(39916800u),
  51       static_cast<double>(23531376880.41075968857200767445163675473L),
  52       static_cast<double>(0u)
  53    };
  54    register __m128d vx = _mm_load1_pd(&x);
  55    register __m128d sum_even = _mm_load_pd(coeff);
  56    register __m128d sum_odd = _mm_load_pd(coeff+2);
  57    register __m128d nc_odd, nc_even;
  58    register __m128d vx2 = _mm_mul_pd(vx, vx);
  59
  60    sum_even = _mm_mul_pd(sum_even, vx2);
  61    nc_even = _mm_load_pd(coeff + 4);
  62    sum_odd = _mm_mul_pd(sum_odd, vx2);
  63    nc_odd = _mm_load_pd(coeff + 6);
  64    sum_even = _mm_add_pd(sum_even, nc_even);
  65    sum_odd = _mm_add_pd(sum_odd, nc_odd);
  66
  67    sum_even = _mm_mul_pd(sum_even, vx2);
  68    nc_even = _mm_load_pd(coeff + 8);
  69    sum_odd = _mm_mul_pd(sum_odd, vx2);
  70    nc_odd = _mm_load_pd(coeff + 10);
  71    sum_even = _mm_add_pd(sum_even, nc_even);
  72    sum_odd = _mm_add_pd(sum_odd, nc_odd);
  73
  74    sum_even = _mm_mul_pd(sum_even, vx2);
  75    nc_even = _mm_load_pd(coeff + 12);
  76    sum_odd = _mm_mul_pd(sum_odd, vx2);
  77    nc_odd = _mm_load_pd(coeff + 14);
  78    sum_even = _mm_add_pd(sum_even, nc_even);
  79    sum_odd = _mm_add_pd(sum_odd, nc_odd);
  80
  81    sum_even = _mm_mul_pd(sum_even, vx2);
  82    nc_even = _mm_load_pd(coeff + 16);
  83    sum_odd = _mm_mul_pd(sum_odd, vx2);
  84    nc_odd = _mm_load_pd(coeff + 18);
  85    sum_even = _mm_add_pd(sum_even, nc_even);
  86    sum_odd = _mm_add_pd(sum_odd, nc_odd);
  87
  88    sum_even = _mm_mul_pd(sum_even, vx2);
  89    nc_even = _mm_load_pd(coeff + 20);
  90    sum_odd = _mm_mul_pd(sum_odd, vx2);
  91    nc_odd = _mm_load_pd(coeff + 22);
  92    sum_even = _mm_add_pd(sum_even, nc_even);
  93    sum_odd = _mm_add_pd(sum_odd, nc_odd);
  94
  95    sum_even = _mm_mul_pd(sum_even, vx2);
  96    nc_even = _mm_load_pd(coeff + 24);
  97    sum_odd = _mm_mul_pd(sum_odd, vx);
  98    sum_even = _mm_add_pd(sum_even, nc_even);
  99    sum_even = _mm_add_pd(sum_even, sum_odd);
 100
 101
 102    double ALIGN16 t[2];
 103    _mm_store_pd(t, sum_even);
 104
 105    return t[0] / t[1];
 106 }
 107
 108 template <>
 109 inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
 110 {
 111    static const ALIGN16 double coeff[26] = {
 112          static_cast<double>(0.006061842346248906525783753964555936883222L),
 113          static_cast<double>(1u),
 114          static_cast<double>(0.5098416655656676188125178644804694509993L),
 115          static_cast<double>(66u),
 116          static_cast<double>(19.51992788247617482847860966235652136208L),
 117          static_cast<double>(1925u),
 118          static_cast<double>(449.9445569063168119446858607650988409623L),
 119          static_cast<double>(32670u),
 120          static_cast<double>(6955.999602515376140356310115515198987526L),
 121          static_cast<double>(357423u),
 122          static_cast<double>(75999.29304014542649875303443598909137092L),
 123          static_cast<double>(2637558u),
 124          static_cast<double>(601859.6171681098786670226533699352302507L),
 125          static_cast<double>(13339535u),
 126          static_cast<double>(3481712.15498064590882071018964774556468L),
 127          static_cast<double>(45995730u),
 128          static_cast<double>(14605578.08768506808414169982791359218571L),
 129          static_cast<double>(105258076u),
 130          static_cast<double>(43338889.32467613834773723740590533316085L),
 131          static_cast<double>(150917976u),
 132          static_cast<double>(86363131.28813859145546927288977868422342L),
 133          static_cast<double>(120543840u),
 134          static_cast<double>(103794043.1163445451906271053616070238554L),
 135          static_cast<double>(39916800u),
 136          static_cast<double>(56906521.91347156388090791033559122686859L),
 137          static_cast<double>(0u)
 138    };
 139    register __m128d vx = _mm_load1_pd(&x);
 140    register __m128d sum_even = _mm_load_pd(coeff);
 141    register __m128d sum_odd = _mm_load_pd(coeff+2);
 142    register __m128d nc_odd, nc_even;
 143    register __m128d vx2 = _mm_mul_pd(vx, vx);
 144
 145    sum_even = _mm_mul_pd(sum_even, vx2);
 146    nc_even = _mm_load_pd(coeff + 4);
 147    sum_odd = _mm_mul_pd(sum_odd, vx2);
 148    nc_odd = _mm_load_pd(coeff + 6);
 149    sum_even = _mm_add_pd(sum_even, nc_even);
 150    sum_odd = _mm_add_pd(sum_odd, nc_odd);
 151
 152    sum_even = _mm_mul_pd(sum_even, vx2);
 153    nc_even = _mm_load_pd(coeff + 8);
 154    sum_odd = _mm_mul_pd(sum_odd, vx2);
 155    nc_odd = _mm_load_pd(coeff + 10);
 156    sum_even = _mm_add_pd(sum_even, nc_even);
 157    sum_odd = _mm_add_pd(sum_odd, nc_odd);
 158
 159    sum_even = _mm_mul_pd(sum_even, vx2);
 160    nc_even = _mm_load_pd(coeff + 12);
 161    sum_odd = _mm_mul_pd(sum_odd, vx2);
 162    nc_odd = _mm_load_pd(coeff + 14);
 163    sum_even = _mm_add_pd(sum_even, nc_even);
 164    sum_odd = _mm_add_pd(sum_odd, nc_odd);
 165
 166    sum_even = _mm_mul_pd(sum_even, vx2);
 167    nc_even = _mm_load_pd(coeff + 16);
 168    sum_odd = _mm_mul_pd(sum_odd, vx2);
 169    nc_odd = _mm_load_pd(coeff + 18);
 170    sum_even = _mm_add_pd(sum_even, nc_even);
 171    sum_odd = _mm_add_pd(sum_odd, nc_odd);
 172
 173    sum_even = _mm_mul_pd(sum_even, vx2);
 174    nc_even = _mm_load_pd(coeff + 20);
 175    sum_odd = _mm_mul_pd(sum_odd, vx2);
 176    nc_odd = _mm_load_pd(coeff + 22);
 177    sum_even = _mm_add_pd(sum_even, nc_even);
 178    sum_odd = _mm_add_pd(sum_odd, nc_odd);
 179
 180    sum_even = _mm_mul_pd(sum_even, vx2);
 181    nc_even = _mm_load_pd(coeff + 24);
 182    sum_odd = _mm_mul_pd(sum_odd, vx);
 183    sum_even = _mm_add_pd(sum_even, nc_even);
 184    sum_even = _mm_add_pd(sum_even, sum_odd);
 185
 186
 187    double ALIGN16 t[2];
 188    _mm_store_pd(t, sum_even);
 189
 190    return t[0] / t[1];
 191 }
 192
 193 } // namespace lanczos
 194 } // namespace math
 195 } // namespace boost
 196
 197 #undef ALIGN16
 198
 199 #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS
 200
 201
 202
 203
 204