9 #include "boost/math/distributions/normal.hpp"
16 LenDist(int minL = 1, int maxL = 1000) {
22 pdf = new double[span + 1];
23 cdf = new double[span + 1];
25 //set initial parameters
26 pdf[0] = cdf[0] = 0.0;
27 for (int i = 1; i <= span; i++) {
29 cdf[i] = i * 1.0 / span;
38 LenDist& operator=(const LenDist&);
40 void setAsNormal(double, double, int, int);
44 //the corresponding lb and ub are the original one
45 void update(int len, double frac) {
46 assert(len > lb && len <= ub);
47 pdf[len - lb] += frac;
52 int getMinL() const { return lb + 1; }
53 int getMaxL() const { return ub; }
55 double getProb(int len) const {
56 assert(len > lb && len <= ub);
60 //len : mate/fragment length
61 //refL : reference sequence length, in fact, this is totLen for global length distribution
62 double getAdjustedProb(int len, int refL) const {
63 if (len <= lb || len > ub || refL <= lb) return 0.0;
64 double denom = cdf[std::min(ub, refL) - lb];
65 assert(denom >= EPSILON);
66 return pdf[len - lb] / denom;
69 //len : length threshold, any length <= len should be calculated
70 //refL : reference sequence length
71 double getAdjustedCumulativeProb(int len, int refL) const {
72 assert(len > lb && len <= ub && refL > lb);
73 double denom = cdf[std::min(ub, refL) - lb];
74 assert(denom >= EPSILON);
75 return cdf[len - lb] / denom;
78 //for multi-thread usage
79 void collect(const LenDist&);
84 void copyTo(double*&, double*&, int&, int&, int&) const;
86 int simulate(simul*, int);
89 int lb, ub, span; // (lb, ub]
95 LenDist& LenDist::operator=(const LenDist& rv) {
96 if (this == &rv) return *this;
97 if (span != rv.span) {
100 pdf = new double[rv.span + 1];
101 cdf = new double[rv.span + 1];
103 lb = rv.lb; ub = rv.ub; span = rv.span;
104 memcpy(pdf, rv.pdf, sizeof(double) * (span + 1));
105 memcpy(cdf, rv.cdf, sizeof(double) * (span + 1));
110 //Please give interger mean, thanks!
111 //minL: new minimum length, maxL: new maximum length
112 void LenDist::setAsNormal(double mean, double sd, int minL, int maxL) {
113 int meanL = int(mean + .5); // assume meanL is a integer; if not, round to nearest number.
118 if (meanL < minL || meanL > maxL) {
119 fprintf(stderr, "Length distribution's probability mass is not within the possible range! MeanL = %d, MinL = %d, MaxL = %d\n", meanL, minL, maxL);
123 lb = meanL - 1; ub = meanL;
124 pdf = new double[span + 1];
125 cdf = new double[span + 1];
126 pdf[0] = cdf[0] = 0.0;
127 pdf[1] = cdf[1] = 1.0;
133 boost::math::normal norm(mean, sd);
135 if (maxL - minL + 1 > RANGE) {
136 if (meanL <= minL) maxL = minL + RANGE - 1;
137 else if (meanL >= maxL) minL = maxL - RANGE + 1;
139 double lg = mean - (minL - 0.5);
140 double rg = (maxL + 0.5) - mean;
141 double half = RANGE / 2.0;
143 if (lg < half) { assert(rg > half); maxL = minL + RANGE - 1; }
144 else if (rg < half) { assert(lg > half); minL = maxL - RANGE + 1; }
145 else { minL = int(mean - half + 1.0); maxL = int(mean + half); }
149 assert(maxL - minL + 1 <= RANGE);
156 pdf = new double[span + 1];
157 cdf = new double[span + 1];
159 pdf[0] = cdf[0] = 0.0;
161 double old_val, val, sum;
164 old_val = boost::math::cdf(norm, minL - 0.5);
165 for (int i = 1; i <= span; i++) {
166 val = boost::math::cdf(norm, lb + i + 0.5);
167 pdf[i] = val - old_val;
171 assert(sum >= EPSILON);
172 for (int i = 1; i <= span; i++) {
174 cdf[i] = cdf[i - 1] + pdf[i];
180 int bo = 0; // need delete
182 void LenDist::init() {
183 memset(pdf, 0, sizeof(double) * (span + 1));
184 memset(cdf, 0, sizeof(double) * (span + 1));
189 void LenDist::finish() {
192 for (int i = 1; i <= span; i++) {
196 for (int i = 1; i <= span; i++) {
197 pdf[i] = pdf[i] / sum;
198 cdf[i] = cdf[i - 1] + pdf[i];
204 void LenDist::collect(const LenDist& o) {
205 if (lb != o.lb || ub != o.ub) {
208 lb = o.lb; ub = o.ub; span = o.span;
209 pdf = new double[span + 1];
210 cdf = new double[span + 1];
211 memset(pdf, 0, sizeof(double) * (span + 1));
212 memset(cdf, 0, sizeof(double) * (span + 1));
218 for (int i = 1; i <= span; i++) {
223 void LenDist::read(FILE *fi) {
224 //release default space first
228 fscanf(fi, "%d %d %d", &lb, &ub, &span);
229 pdf = new double[span + 1];
230 cdf = new double[span + 1];
231 pdf[0] = cdf[0] = 0.0;
232 for (int i = 1; i <= span; i++) {
233 fscanf(fi, "%lf", &pdf[i]);
234 cdf[i] = cdf[i - 1] + pdf[i];
240 void LenDist::write(FILE *fo) {
241 fprintf(fo, "%d %d %d\n", lb, ub, span);
242 for (int i = 1; i < span; i++) {
243 fprintf(fo, "%.10g ", pdf[i]);
245 fprintf(fo, "%.10g\n", pdf[span]);
248 void LenDist::copyTo(double*& pdf, double*& cdf, int& lb, int& ub, int& span) const {
253 pdf = new double[span + 1];
254 memcpy(pdf, this->pdf, sizeof(double) * (span + 1));
255 cdf = new double[span + 1];
256 memcpy(cdf, this->cdf, sizeof(double) * (span + 1));
259 //refL = -1 means that this length is generated for noise isoform
260 int LenDist::simulate(simul* sampler, int refL) {
263 if (refL == -1) refL = ub;
264 if (refL <= lb || cdf[(dlen = std::min(ub, refL) - lb)] <= 0.0) return -1;
265 int len = lb + 1 + sampler->sample(cdf + 1, dlen);
270 void LenDist::trim() {
272 double *newpdf, *newcdf;
274 for (newlb = 1; newlb <= span && pdf[newlb] < EPSILON; newlb++);
276 for (newub = span; newub > newlb && pdf[newub] < EPSILON; newub--);
277 assert(newlb < newub);
278 if (newlb == 0 && newub == span) return;
280 span = newub - newlb;
281 newpdf = new double[span + 1];
282 memset(newpdf, 0, sizeof(double) * (span + 1));
283 newcdf = new double[span + 1];
284 memset(newcdf, 0, sizeof(double) * (span + 1));
286 for (int i = 1; i <= span; i++) {
287 newpdf[i] = pdf[i + newlb];
288 newcdf[i] = cdf[i + newlb];
301 #endif /* LENDIST_H_ */