10 #include "boost/math/distributions/normal.hpp"
17 LenDist(int minL = 1, int maxL = 1000) {
23 pdf = new double[span + 1];
24 cdf = new double[span + 1];
26 //set initial parameters
27 pdf[0] = cdf[0] = 0.0;
28 for (int i = 1; i <= span; i++) {
30 cdf[i] = i * 1.0 / span;
39 LenDist& operator=(const LenDist&);
41 void setAsNormal(double, double, int, int);
45 //the corresponding lb and ub are the original one
46 void update(int len, double frac) {
47 assert(len > lb && len <= ub);
48 pdf[len - lb] += frac;
53 int getMinL() const { return lb + 1; }
54 int getMaxL() const { return ub; }
56 double getProb(int len) const {
57 assert(len > lb && len <= ub);
61 //len : mate/fragment length
62 //refL : reference sequence length, in fact, this is totLen for global length distribution
63 double getAdjustedProb(int len, int refL) const {
64 if (len <= lb || len > ub || refL <= lb) return 0.0;
65 double denom = cdf[std::min(ub, refL) - lb];
66 assert(denom >= EPSILON);
67 return pdf[len - lb] / denom;
70 //len : length threshold, any length <= len should be calculated
71 //refL : reference sequence length
72 double getAdjustedCumulativeProb(int len, int refL) const {
73 assert(len > lb && len <= ub && refL > lb);
74 double denom = cdf[std::min(ub, refL) - lb];
75 assert(denom >= EPSILON);
76 return cdf[len - lb] / denom;
79 //for multi-thread usage
80 void collect(const LenDist&);
85 void copyTo(double*&, double*&, int&, int&, int&) const;
87 int simulate(simul*, int);
90 int lb, ub, span; // (lb, ub]
96 LenDist& LenDist::operator=(const LenDist& rv) {
97 if (this == &rv) return *this;
98 if (span != rv.span) {
101 pdf = new double[rv.span + 1];
102 cdf = new double[rv.span + 1];
104 lb = rv.lb; ub = rv.ub; span = rv.span;
105 memcpy(pdf, rv.pdf, sizeof(double) * (span + 1));
106 memcpy(cdf, rv.cdf, sizeof(double) * (span + 1));
111 //Please give interger mean, thanks!
112 //minL: new minimum length, maxL: new maximum length
113 void LenDist::setAsNormal(double mean, double sd, int minL, int maxL) {
114 int meanL = int(mean + .5); // assume meanL is a integer; if not, round to nearest number.
119 if (meanL < minL || meanL > maxL) {
120 fprintf(stderr, "Length distribution's probability mass is not within the possible range! MeanL = %d, MinL = %d, MaxL = %d\n", meanL, minL, maxL);
124 lb = meanL - 1; ub = meanL;
125 pdf = new double[span + 1];
126 cdf = new double[span + 1];
127 pdf[0] = cdf[0] = 0.0;
128 pdf[1] = cdf[1] = 1.0;
134 boost::math::normal norm(mean, sd);
136 if (maxL - minL + 1 > RANGE) {
137 if (meanL <= minL) maxL = minL + RANGE - 1;
138 else if (meanL >= maxL) minL = maxL - RANGE + 1;
140 double lg = mean - (minL - 0.5);
141 double rg = (maxL + 0.5) - mean;
142 double half = RANGE / 2.0;
144 if (lg < half) { assert(rg > half); maxL = minL + RANGE - 1; }
145 else if (rg < half) { assert(lg > half); minL = maxL - RANGE + 1; }
146 else { minL = int(mean - half + 1.0); maxL = int(mean + half); }
150 assert(maxL - minL + 1 <= RANGE);
157 pdf = new double[span + 1];
158 cdf = new double[span + 1];
160 pdf[0] = cdf[0] = 0.0;
162 double old_val, val, sum;
165 old_val = boost::math::cdf(norm, minL - 0.5);
166 for (int i = 1; i <= span; i++) {
167 val = boost::math::cdf(norm, lb + i + 0.5);
168 pdf[i] = val - old_val;
172 assert(sum >= EPSILON);
173 for (int i = 1; i <= span; i++) {
175 cdf[i] = cdf[i - 1] + pdf[i];
181 void LenDist::init() {
182 memset(pdf, 0, sizeof(double) * (span + 1));
183 memset(cdf, 0, sizeof(double) * (span + 1));
186 void LenDist::finish() {
189 for (int i = 1; i <= span; i++) {
193 if (sum <= EPSILON) { fprintf(stderr, "No valid read to estimate the length distribution!\n"); exit(-1); }
195 for (int i = 1; i <= span; i++) {
196 pdf[i] = pdf[i] / sum;
197 cdf[i] = cdf[i - 1] + pdf[i];
203 void LenDist::collect(const LenDist& o) {
204 if (lb != o.lb || ub != o.ub) {
207 lb = o.lb; ub = o.ub; span = o.span;
208 pdf = new double[span + 1];
209 cdf = new double[span + 1];
210 memset(pdf, 0, sizeof(double) * (span + 1));
211 memset(cdf, 0, sizeof(double) * (span + 1));
213 for (int i = 1; i <= span; i++) {
218 void LenDist::read(FILE *fi) {
219 //release default space first
223 assert(fscanf(fi, "%d %d %d", &lb, &ub, &span) == 3);
224 pdf = new double[span + 1];
225 cdf = new double[span + 1];
226 pdf[0] = cdf[0] = 0.0;
227 for (int i = 1; i <= span; i++) {
228 assert(fscanf(fi, "%lf", &pdf[i]) == 1);
229 cdf[i] = cdf[i - 1] + pdf[i];
235 void LenDist::write(FILE *fo) {
236 fprintf(fo, "%d %d %d\n", lb, ub, span);
237 for (int i = 1; i < span; i++) {
238 fprintf(fo, "%.10g ", pdf[i]);
240 fprintf(fo, "%.10g\n", pdf[span]);
243 void LenDist::copyTo(double*& pdf, double*& cdf, int& lb, int& ub, int& span) const {
248 pdf = new double[span + 1];
249 memcpy(pdf, this->pdf, sizeof(double) * (span + 1));
250 cdf = new double[span + 1];
251 memcpy(cdf, this->cdf, sizeof(double) * (span + 1));
254 //refL = -1 means that this length is generated for noise isoform
255 int LenDist::simulate(simul* sampler, int refL) {
258 if (refL == -1) refL = ub;
259 if (refL <= lb || cdf[(dlen = std::min(ub, refL) - lb)] <= 0.0) return -1;
260 int len = lb + 1 + sampler->sample(cdf + 1, dlen);
265 void LenDist::trim() {
267 double *newpdf, *newcdf;
269 for (newlb = 1; newlb <= span && pdf[newlb] < EPSILON; newlb++);
271 for (newub = span; newub > newlb && pdf[newub] < EPSILON; newub--);
272 assert(newlb < newub);
273 if (newlb == 0 && newub == span) return;
275 span = newub - newlb;
276 newpdf = new double[span + 1];
277 memset(newpdf, 0, sizeof(double) * (span + 1));
278 newcdf = new double[span + 1];
279 memset(newcdf, 0, sizeof(double) * (span + 1));
281 for (int i = 1; i <= span; i++) {
282 newpdf[i] = pdf[i + newlb];
283 newcdf[i] = cdf[i + newlb];
296 #endif /* LENDIST_H_ */