9 #include "boost/math/distributions/normal.hpp"
16 LenDist(int minL = 1, int maxL = 1000) {
22 pdf = new double[span + 1];
23 cdf = new double[span + 1];
25 //set initial parameters
26 pdf[0] = cdf[0] = 0.0;
27 for (int i = 1; i <= span; i++) {
29 cdf[i] = i * 1.0 / span;
38 LenDist& operator=(const LenDist&);
40 void setAsNormal(double, double, int, int);
44 //the corresponding lb and ub are the original one
45 void update(int len, double frac) {
46 assert(len > lb && len <= ub);
47 pdf[len - lb] += frac;
52 int getMinL() const { return lb + 1; }
53 int getMaxL() const { return ub; }
55 double getProb(int len) const {
56 assert(len > lb && len <= ub);
60 //len : mate/fragment length
61 //refL : reference sequence length, in fact, this is totLen for global length distribution
62 double getAdjustedProb(int len, int refL) const {
63 if (len <= lb || len > ub || refL <= lb) return 0.0;
64 double denom = cdf[std::min(ub, refL) - lb];
65 assert(denom >= EPSILON);
66 return pdf[len - lb] / denom;
69 //len : length threshold, any length <= len should be calculated
70 //refL : reference sequence length
71 double getAdjustedCumulativeProb(int len, int refL) const {
72 assert(len > lb && len <= ub && refL > lb);
73 double denom = cdf[std::min(ub, refL) - lb];
74 assert(denom >= EPSILON);
75 return cdf[len - lb] / denom;
78 //for multi-thread usage
79 void collect(const LenDist&);
84 void copyTo(double*&, double*&, int&, int&, int&) const;
86 int simulate(simul*, int);
89 int lb, ub, span; // (lb, ub]
95 LenDist& LenDist::operator=(const LenDist& rv) {
96 if (this == &rv) return *this;
97 if (span != rv.span) {
100 pdf = new double[rv.span + 1];
101 cdf = new double[rv.span + 1];
103 lb = rv.lb; ub = rv.ub; span = rv.span;
104 memcpy(pdf, rv.pdf, sizeof(double) * (span + 1));
105 memcpy(cdf, rv.cdf, sizeof(double) * (span + 1));
110 //Please give interger mean, thanks!
111 //minL: new minimum length, maxL: new maximum length
112 void LenDist::setAsNormal(double mean, double sd, int minL, int maxL) {
113 int meanL = int(mean + .5); // assume meanL is a integer; if not, round to nearest number.
118 if (meanL < minL || meanL > maxL) {
119 fprintf(stderr, "Length distribution's probability mass is not within the possible range! MeanL = %d, MinL = %d, MaxL = %d\n", meanL, minL, maxL);
123 lb = meanL - 1; ub = meanL;
124 pdf = new double[span + 1];
125 cdf = new double[span + 1];
126 pdf[0] = cdf[0] = 0.0;
127 pdf[1] = cdf[1] = 1.0;
133 boost::math::normal norm(mean, sd);
135 if (maxL - minL + 1 > RANGE) {
136 if (meanL <= minL) maxL = minL + RANGE - 1;
137 else if (meanL >= maxL) minL = maxL - RANGE + 1;
139 double lg = mean - (minL - 0.5);
140 double rg = (maxL + 0.5) - mean;
141 double half = RANGE / 2.0;
143 if (lg < half) { assert(rg > half); maxL = minL + RANGE - 1; }
144 else if (rg < half) { assert(lg > half); minL = maxL - RANGE + 1; }
145 else { minL = int(mean - half + 1.0); maxL = int(mean + half); }
149 assert(maxL - minL + 1 <= RANGE);
156 pdf = new double[span + 1];
157 cdf = new double[span + 1];
159 pdf[0] = cdf[0] = 0.0;
161 double old_val, val, sum;
164 old_val = boost::math::cdf(norm, minL - 0.5);
165 for (int i = 1; i <= span; i++) {
166 val = boost::math::cdf(norm, lb + i + 0.5);
167 pdf[i] = val - old_val;
171 assert(sum >= EPSILON);
172 for (int i = 1; i <= span; i++) {
174 cdf[i] = cdf[i - 1] + pdf[i];
180 void LenDist::init() {
181 memset(pdf, 0, sizeof(double) * (span + 1));
182 memset(cdf, 0, sizeof(double) * (span + 1));
185 void LenDist::finish() {
188 for (int i = 1; i <= span; i++) {
192 for (int i = 1; i <= span; i++) {
193 pdf[i] = pdf[i] / sum;
194 cdf[i] = cdf[i - 1] + pdf[i];
200 void LenDist::collect(const LenDist& o) {
201 if (lb != o.lb || ub != o.ub) {
204 lb = o.lb; ub = o.ub; span = o.span;
205 pdf = new double[span + 1];
206 cdf = new double[span + 1];
207 memset(pdf, 0, sizeof(double) * (span + 1));
208 memset(cdf, 0, sizeof(double) * (span + 1));
210 for (int i = 1; i <= span; i++) {
215 void LenDist::read(FILE *fi) {
216 //release default space first
220 fscanf(fi, "%d %d %d", &lb, &ub, &span);
221 pdf = new double[span + 1];
222 cdf = new double[span + 1];
223 pdf[0] = cdf[0] = 0.0;
224 for (int i = 1; i <= span; i++) {
225 fscanf(fi, "%lf", &pdf[i]);
226 cdf[i] = cdf[i - 1] + pdf[i];
232 void LenDist::write(FILE *fo) {
233 fprintf(fo, "%d %d %d\n", lb, ub, span);
234 for (int i = 1; i < span; i++) {
235 fprintf(fo, "%.10g ", pdf[i]);
237 fprintf(fo, "%.10g\n", pdf[span]);
240 void LenDist::copyTo(double*& pdf, double*& cdf, int& lb, int& ub, int& span) const {
245 pdf = new double[span + 1];
246 memcpy(pdf, this->pdf, sizeof(double) * (span + 1));
247 cdf = new double[span + 1];
248 memcpy(cdf, this->cdf, sizeof(double) * (span + 1));
251 //refL = -1 means that this length is generated for noise isoform
252 int LenDist::simulate(simul* sampler, int refL) {
255 if (refL == -1) refL = ub;
256 if (refL <= lb || cdf[(dlen = std::min(ub, refL) - lb)] <= 0.0) return -1;
257 int len = lb + 1 + sampler->sample(cdf + 1, dlen);
262 void LenDist::trim() {
264 double *newpdf, *newcdf;
266 for (newlb = 1; newlb <= span && pdf[newlb] < EPSILON; newlb++);
268 for (newub = span; newub > newlb && pdf[newub] < EPSILON; newub--);
269 assert(newlb < newub);
270 if (newlb == 0 && newub == span) return;
272 span = newub - newlb;
273 newpdf = new double[span + 1];
274 memset(newpdf, 0, sizeof(double) * (span + 1));
275 newcdf = new double[span + 1];
276 memset(newcdf, 0, sizeof(double) * (span + 1));
278 for (int i = 1; i <= span; i++) {
279 newpdf[i] = pdf[i + newlb];
280 newcdf[i] = cdf[i + newlb];
293 #endif /* LENDIST_H_ */