]> git.donarmstrong.com Git - rsem.git/blob - boost/math/special_functions/detail/lanczos_sse2.hpp
Updated boost to v1.55.0
[rsem.git] / boost / math / special_functions / detail / lanczos_sse2.hpp
1 //  (C) Copyright John Maddock 2006.
2 //  Use, modification and distribution are subject to the
3 //  Boost Software License, Version 1.0. (See accompanying file
4 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6 #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
7 #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
8
9 #ifdef _MSC_VER
10 #pragma once
11 #endif
12
13 #include <emmintrin.h>
14
15 #if defined(__GNUC__) || defined(__PGI)
16 #define ALIGN16 __attribute__((__aligned__(16)))
17 #else
18 #define ALIGN16 __declspec(align(16))
19 #endif
20
21 namespace boost{ namespace math{ namespace lanczos{
22
23 template <>
24 inline double lanczos13m53::lanczos_sum<double>(const double& x)
25 {
26    static const ALIGN16 double coeff[26] = {
27       static_cast<double>(2.506628274631000270164908177133837338626L),
28       static_cast<double>(1u),
29       static_cast<double>(210.8242777515793458725097339207133627117L),
30       static_cast<double>(66u),
31       static_cast<double>(8071.672002365816210638002902272250613822L),
32       static_cast<double>(1925u),
33       static_cast<double>(186056.2653952234950402949897160456992822L),
34       static_cast<double>(32670u),
35       static_cast<double>(2876370.628935372441225409051620849613599L),
36       static_cast<double>(357423u),
37       static_cast<double>(31426415.58540019438061423162831820536287L),
38       static_cast<double>(2637558u),
39       static_cast<double>(248874557.8620541565114603864132294232163L),
40       static_cast<double>(13339535u),
41       static_cast<double>(1439720407.311721673663223072794912393972L),
42       static_cast<double>(45995730u),
43       static_cast<double>(6039542586.35202800506429164430729792107L),
44       static_cast<double>(105258076u),
45       static_cast<double>(17921034426.03720969991975575445893111267L),
46       static_cast<double>(150917976u),
47       static_cast<double>(35711959237.35566804944018545154716670596L),
48       static_cast<double>(120543840u),
49       static_cast<double>(42919803642.64909876895789904700198885093L),
50       static_cast<double>(39916800u),
51       static_cast<double>(23531376880.41075968857200767445163675473L),
52       static_cast<double>(0u)
53    };
54    register __m128d vx = _mm_load1_pd(&x);
55    register __m128d sum_even = _mm_load_pd(coeff);
56    register __m128d sum_odd = _mm_load_pd(coeff+2);
57    register __m128d nc_odd, nc_even;
58    register __m128d vx2 = _mm_mul_pd(vx, vx);
59
60    sum_even = _mm_mul_pd(sum_even, vx2);
61    nc_even = _mm_load_pd(coeff + 4);
62    sum_odd = _mm_mul_pd(sum_odd, vx2);
63    nc_odd = _mm_load_pd(coeff + 6);
64    sum_even = _mm_add_pd(sum_even, nc_even);
65    sum_odd = _mm_add_pd(sum_odd, nc_odd);
66
67    sum_even = _mm_mul_pd(sum_even, vx2);
68    nc_even = _mm_load_pd(coeff + 8);
69    sum_odd = _mm_mul_pd(sum_odd, vx2);
70    nc_odd = _mm_load_pd(coeff + 10);
71    sum_even = _mm_add_pd(sum_even, nc_even);
72    sum_odd = _mm_add_pd(sum_odd, nc_odd);
73
74    sum_even = _mm_mul_pd(sum_even, vx2);
75    nc_even = _mm_load_pd(coeff + 12);
76    sum_odd = _mm_mul_pd(sum_odd, vx2);
77    nc_odd = _mm_load_pd(coeff + 14);
78    sum_even = _mm_add_pd(sum_even, nc_even);
79    sum_odd = _mm_add_pd(sum_odd, nc_odd);
80
81    sum_even = _mm_mul_pd(sum_even, vx2);
82    nc_even = _mm_load_pd(coeff + 16);
83    sum_odd = _mm_mul_pd(sum_odd, vx2);
84    nc_odd = _mm_load_pd(coeff + 18);
85    sum_even = _mm_add_pd(sum_even, nc_even);
86    sum_odd = _mm_add_pd(sum_odd, nc_odd);
87
88    sum_even = _mm_mul_pd(sum_even, vx2);
89    nc_even = _mm_load_pd(coeff + 20);
90    sum_odd = _mm_mul_pd(sum_odd, vx2);
91    nc_odd = _mm_load_pd(coeff + 22);
92    sum_even = _mm_add_pd(sum_even, nc_even);
93    sum_odd = _mm_add_pd(sum_odd, nc_odd);
94
95    sum_even = _mm_mul_pd(sum_even, vx2);
96    nc_even = _mm_load_pd(coeff + 24);
97    sum_odd = _mm_mul_pd(sum_odd, vx);
98    sum_even = _mm_add_pd(sum_even, nc_even);
99    sum_even = _mm_add_pd(sum_even, sum_odd);
100
101
102    double ALIGN16 t[2];
103    _mm_store_pd(t, sum_even);
104    
105    return t[0] / t[1];
106 }
107
108 template <>
109 inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
110 {
111    static const ALIGN16 double coeff[26] = {
112          static_cast<double>(0.006061842346248906525783753964555936883222L),
113          static_cast<double>(1u),
114          static_cast<double>(0.5098416655656676188125178644804694509993L),
115          static_cast<double>(66u),
116          static_cast<double>(19.51992788247617482847860966235652136208L),
117          static_cast<double>(1925u),
118          static_cast<double>(449.9445569063168119446858607650988409623L),
119          static_cast<double>(32670u),
120          static_cast<double>(6955.999602515376140356310115515198987526L),
121          static_cast<double>(357423u),
122          static_cast<double>(75999.29304014542649875303443598909137092L),
123          static_cast<double>(2637558u),
124          static_cast<double>(601859.6171681098786670226533699352302507L),
125          static_cast<double>(13339535u),
126          static_cast<double>(3481712.15498064590882071018964774556468L),
127          static_cast<double>(45995730u),
128          static_cast<double>(14605578.08768506808414169982791359218571L),
129          static_cast<double>(105258076u),
130          static_cast<double>(43338889.32467613834773723740590533316085L),
131          static_cast<double>(150917976u),
132          static_cast<double>(86363131.28813859145546927288977868422342L),
133          static_cast<double>(120543840u),
134          static_cast<double>(103794043.1163445451906271053616070238554L),
135          static_cast<double>(39916800u),
136          static_cast<double>(56906521.91347156388090791033559122686859L),
137          static_cast<double>(0u)
138    };
139    register __m128d vx = _mm_load1_pd(&x);
140    register __m128d sum_even = _mm_load_pd(coeff);
141    register __m128d sum_odd = _mm_load_pd(coeff+2);
142    register __m128d nc_odd, nc_even;
143    register __m128d vx2 = _mm_mul_pd(vx, vx);
144
145    sum_even = _mm_mul_pd(sum_even, vx2);
146    nc_even = _mm_load_pd(coeff + 4);
147    sum_odd = _mm_mul_pd(sum_odd, vx2);
148    nc_odd = _mm_load_pd(coeff + 6);
149    sum_even = _mm_add_pd(sum_even, nc_even);
150    sum_odd = _mm_add_pd(sum_odd, nc_odd);
151
152    sum_even = _mm_mul_pd(sum_even, vx2);
153    nc_even = _mm_load_pd(coeff + 8);
154    sum_odd = _mm_mul_pd(sum_odd, vx2);
155    nc_odd = _mm_load_pd(coeff + 10);
156    sum_even = _mm_add_pd(sum_even, nc_even);
157    sum_odd = _mm_add_pd(sum_odd, nc_odd);
158
159    sum_even = _mm_mul_pd(sum_even, vx2);
160    nc_even = _mm_load_pd(coeff + 12);
161    sum_odd = _mm_mul_pd(sum_odd, vx2);
162    nc_odd = _mm_load_pd(coeff + 14);
163    sum_even = _mm_add_pd(sum_even, nc_even);
164    sum_odd = _mm_add_pd(sum_odd, nc_odd);
165
166    sum_even = _mm_mul_pd(sum_even, vx2);
167    nc_even = _mm_load_pd(coeff + 16);
168    sum_odd = _mm_mul_pd(sum_odd, vx2);
169    nc_odd = _mm_load_pd(coeff + 18);
170    sum_even = _mm_add_pd(sum_even, nc_even);
171    sum_odd = _mm_add_pd(sum_odd, nc_odd);
172
173    sum_even = _mm_mul_pd(sum_even, vx2);
174    nc_even = _mm_load_pd(coeff + 20);
175    sum_odd = _mm_mul_pd(sum_odd, vx2);
176    nc_odd = _mm_load_pd(coeff + 22);
177    sum_even = _mm_add_pd(sum_even, nc_even);
178    sum_odd = _mm_add_pd(sum_odd, nc_odd);
179
180    sum_even = _mm_mul_pd(sum_even, vx2);
181    nc_even = _mm_load_pd(coeff + 24);
182    sum_odd = _mm_mul_pd(sum_odd, vx);
183    sum_even = _mm_add_pd(sum_even, nc_even);
184    sum_even = _mm_add_pd(sum_even, sum_odd);
185
186
187    double ALIGN16 t[2];
188    _mm_store_pd(t, sum_even);
189    
190    return t[0] / t[1];
191 }
192
193 } // namespace lanczos
194 } // namespace math
195 } // namespace boost
196
197 #undef ALIGN16
198
199 #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS
200
201
202
203
204