#include<string>
#include<algorithm>
#include<sstream>
+#include<iostream>
+#include<vector>
#include "utils.h"
+#include "my_assert.h"
#include "Orientation.h"
#include "LenDist.h"
#include "RSPD.h"
// Map the alignment onto forward-strand coordinates and validate it against the transcript length.
int readLen = read.getReadLength();
int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
- assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
// The single combined assertion is split into three checks so that each failure message
// names the read and the transcript id involved.
+ general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
+ " from the forward direction, which should be a non-negative number! " + \
+ "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
+ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
+ + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
+
int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
// A seed that starts at or beyond fullLen, or whose position is flagged by the reference mask, yields 0.0.
if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
// Gives read-only access to the LenDist object that the `gld` pointer refers to.
const LenDist& getGLD() {
  const LenDist& dist = *gld;
  return dist;
}
// Simulation interface.
// startSimulation takes the sampler and a theta vector; simulate takes a read id, the read
// object to fill in, and an int reference for the transcript id (named `sid` in the definition).
// NOTE(review): the contract of finishSimulation is not visible in this excerpt; presumably
// it releases what startSimulation allocated — confirm.
- void startSimulation(simul*, double*);
- bool simulate(int, SingleReadQ&, int&);
+ void startSimulation(simul*, const std::vector<double>&);
+ bool simulate(READ_INT_TYPE, SingleReadQ&, int&);
void finishSimulation();
// Returns the `mw` array; asserts that it has been set (non-NULL) first.
// Use it after function 'read' or 'estimateFromReads'.
// The returned pointer becomes const-qualified, so callers can read but not modify the values.
- double* getMW() {
+ const double* getMW() {
assert(mw != NULL);
return mw;
}
// Value passed to genReadFileNames as the read-type argument for this model.
static const int read_type = 1;
// Index of the last reference: `mw` holds M + 1 entries and calcMW visits references 1..M.
int M;
// One entry per read group 0..2; a group is scanned only when its entry is positive.
// presumably these are per-group read counts — TODO confirm
- int N[3];
+ READ_INT_TYPE N[3];
Refs *refs;
double mean, sd;
// Seed length used when locating the seed on a reference and when screening short reads.
int seedLen;
// Scan every non-empty read group i (0..2) and accumulate statistics from its reads.
for (int i = 0; i < 3; i++)
if (N[i] > 0) {
genReadFileNames(readFN, i, read_type, s, readFs);
- ReadReader<SingleReadQ> reader(s, readFs);
+ ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
- int cnt = 0;
+ READ_INT_TYPE cnt = 0;
while (reader.next(read)) {
- mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
- qd->update(read.getQScore());
- if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
// Reads flagged as low quality no longer contribute to the length distribution, qd or nqpro,
// but they are still counted in cnt below. Only group 0 updates nqpro.
+ if (!read.isLowQuality()) {
+ mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+ qd->update(read.getQScore());
+ if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+ }
// The warning is printed only in verbose mode and only when the read is shorter than the seed length.
+ else if (verbose && read.getReadLength() < seedLen) {
+ std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl;
+ }
++cnt;
// Progress output moves from printf("%d") to std::cout alongside the change of cnt to READ_INT_TYPE.
- if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
+ if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
}
- if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
+ if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
}
// Finalize whichever length distribution received the updates above.
mld != NULL ? mld->finish() : gld->finish();
// Open the model file; failing to open it is fatal.
FILE *fi = fopen(inpF, "r");
if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
- fscanf(fi, "%d", &val);
+ // The parse must not live inside assert(): when NDEBUG is defined the assert argument is
+ // never evaluated, so the value would not be read at all. Check the result explicitly instead.
+ if (fscanf(fi, "%d", &val) != 1) { fprintf(stderr, "Cannot read the model type from %s!\n", inpF); exit(-1); }
assert(val == model_type);
ori->read(fi);
gld->read(fi);
- fscanf(fi, "%d", &val);
+ if (fscanf(fi, "%d", &val) != 1) { fprintf(stderr, "Cannot read an expected integer field from %s!\n", inpF); exit(-1); }
if (val > 0) {
if (mld == NULL) mld = new LenDist();
mld->read(fi);
if (M == 0) M = val;
if (M == val) {
mw = new double[M + 1];
- for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
+ // Every one of the M + 1 entries must parse; a short or malformed file is fatal.
+ for (int i = 0; i <= M; i++) {
+ if (fscanf(fi, "%lf", &mw[i]) != 1) { fprintf(stderr, "Cannot read mw[%d] from %s!\n", i, inpF); exit(-1); }
+ }
}
}
fclose(fo);
}
// Prepares the model for simulation: remembers the sampler, allocates theta_cdf with
// M + 1 entries and starts nqpro's simulation state.
// NOTE(review): this excerpt shows no use of `theta` and no initialization of theta_cdf;
// presumably those lines are elided from the view — confirm against the full file.
-void SingleQModel::startSimulation(simul* sampler, double* theta) {
+void SingleQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
this->sampler = sampler;
theta_cdf = new double[M + 1];
nqpro->startSimulation();
}
// Simulates one read and stores it in `read`.
// NOTE(review): most of the body is not visible in this excerpt (the braces below do not
// balance); only the visible statements are documented here.
-bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
+bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) {
int dir, pos, readLen, fragLen;
std::string name;
std::string qual, readseq;
}
}
// The read name has the form "<rid>_<dir>_<sid>_<pos>".
// NOTE(review): the local stream named `stdout` is replaced by `strout`, whose declaration is
// not visible here; presumably the rename avoids a clash with the C stdio name — confirm.
- std::ostringstream stdout;
- stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
- name = stdout.str();
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
+ name = strout.str();
read = SingleReadQ(name, readseq, qual);
}
// Fills mw[0..M]. mw[0] is set to 1.0. For every reference i in 1..M the routine sums, over
// the seed positions handled below and over the admissible fragment lengths, the product of
// the orientation probability, gld's adjusted length probability, rspd's adjusted position
// probability and the mld factor. mw[i] is 1.0 minus that sum, and is set to 0.0 when it
// falls below 1e-8.
// Relative to the removed version, the added version drops the `seedLen >= OLEN` part of the
// assertion; the remaining statements are the same.
void SingleQModel::calcMW() {
- double probF, probR;
-
- assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
-
- memset(mw, 0, sizeof(double) * (M + 1));
- mw[0] = 1.0;
-
- probF = ori->getProb(0);
- probR = ori->getProb(1);
-
- for (int i = 1; i <= M; i++) {
- RefSeq& ref = refs->getRef(i);
- int totLen = ref.getTotLen();
- int fullLen = ref.getFullLen();
- double value = 0.0;
- int minL, maxL;
- int effL, pfpos;
- int end = std::min(fullLen, totLen - seedLen + 1);
- double factor;
-
- for (int seedPos = 0; seedPos < end; seedPos++)
- if (ref.getMask(seedPos)) {
- //forward
- minL = gld->getMinL();
- maxL = std::min(gld->getMaxL(), totLen - seedPos);
- pfpos = seedPos;
- for (int fragLen = minL; fragLen <= maxL; fragLen++) {
- effL = std::min(fullLen, totLen - fragLen + 1);
- factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
- value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
- }
- //reverse
- minL = gld->getMinL();
- maxL = std::min(gld->getMaxL(), seedPos + seedLen);
- for (int fragLen = minL; fragLen <= maxL; fragLen++) {
- pfpos = seedPos - (fragLen - seedLen);
- effL = std::min(fullLen, totLen - fragLen + 1);
- factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
- value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ double probF, probR;
+
// The shortest length of the distribution in use (mld when present, otherwise gld) must be at least seedLen.
+ assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
+
+ memset(mw, 0, sizeof(double) * (M + 1));
+ mw[0] = 1.0;
+
// Orientation probabilities: index 0 is used for the forward terms, index 1 for the reverse terms.
+ probF = ori->getProb(0);
+ probR = ori->getProb(1);
+
+ for (int i = 1; i <= M; i++) {
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
// `end` is one past the last seed start that is below fullLen and still leaves room for a whole seed.
+ int end = std::min(fullLen, totLen - seedLen + 1);
+ double factor;
+
// Only seed positions flagged by the reference mask contribute in this loop.
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) {
+ // forward: the fragment starts at the seed position
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), totLen - seedPos);
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ // reverse: the fragment ends where the seed ends, so it starts (fragLen - seedLen) before the seed
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ // for reverse strand masking: seed starts from `end` up to totLen - seedLen, reverse terms only
+ for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
+ minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ mw[i] = 1.0 - value;
+
// Values below 1e-8 are set to exactly 0.0.
+ if (mw[i] < 1e-8) {
+ // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
}
- }
-
- //for reverse strand masking
- for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
- minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
- maxL = std::min(gld->getMaxL(), seedPos + seedLen);
- for (int fragLen = minL; fragLen <= maxL; fragLen++) {
- pfpos = seedPos - (fragLen - seedLen);
- effL = std::min(fullLen, totLen - fragLen + 1);
- factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
- value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
- }
- }
-
- mw[i] = 1.0 - value;
-
- if (mw[i] < 1e-8) {
- // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
- mw[i] = 0.0;
- }
- }
}
#endif /* SINGLEQMODEL_H_ */