#include<string>
#include<algorithm>
#include<sstream>
+#include<iostream>
+#include<vector>
#include "utils.h"
+#include "my_assert.h"
#include "Orientation.h"
#include "LenDist.h"
#include "RSPD.h"
mw = NULL;
if (isMaster) {
- ori = new Orientation(params.probF);
if (!estRSPD) rspd = new RSPD(estRSPD);
qd = new QualDist();
mld = new LenDist(params.mate_minL, params.mate_maxL);
}
+ ori = new Orientation(params.probF);
gld = new LenDist(params.minL, params.maxL);
if (estRSPD) rspd = new RSPD(estRSPD, params.B);
qpro = new QProfile();
int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
int effL = std::min(fullLen, totLen - insertLen + 1);
- assert(fpos >= 0 && fpos + insertLen <= totLen && insertLen <= totLen);
+ general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
+ " from the forward direction, which should be a non-negative number! " + \
+ "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
+ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript " \
+ + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!");
+
if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
// Gives read-only access to the LenDist object that gld points to.
const LenDist& getGLD() {
  return *gld;
}
- void startSimulation(simul*, double*);
- bool simulate(int, PairedEndReadQ&, int&);
+ void startSimulation(simul*, const std::vector<double>&);
+ bool simulate(READ_INT_TYPE, PairedEndReadQ&, int&);
void finishSimulation();
// Returns the mw array after asserting that it is not NULL.
// Use it only after function 'read' or 'estimateFromReads' has run.
// NOTE(review): presumably those are the calls that set mw up — confirm against their full bodies.
- double* getMW() {
+ const double* getMW() {
assert(mw != NULL); // catches a call made while mw is still NULL
return mw;
}
// Reports the model_type constant of this class.
int getModelType() const {
  return model_type;
}
-
- bool simulate(int, PairedEndReadQ&, SingleReadQ&, int&);
private:
static const int model_type = 3;
static const int read_type = 3;
int M;
- int N[3];
+ READ_INT_TYPE N[3];
Refs *refs;
int seedLen;
for (int i = 0; i < 3; i++)
if (N[i] > 0) {
genReadFileNames(readFN, i, read_type, s, readFs);
- ReadReader<PairedEndReadQ> reader(s, readFs);
+ ReadReader<PairedEndReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
- int cnt = 0;
+ READ_INT_TYPE cnt = 0;
while (reader.next(read)) {
SingleReadQ mate1 = read.getMate1();
SingleReadQ mate2 = read.getMate2();
- mld->update(mate1.getReadLength(), 1.0);
- mld->update(mate2.getReadLength(), 1.0);
-
- qd->update(mate1.getQScore());
- qd->update(mate2.getQScore());
-
- if (i == 0) {
- nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
- nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+ if (!read.isLowQuality()) {
+ mld->update(mate1.getReadLength(), 1.0);
+ mld->update(mate2.getReadLength(), 1.0);
+
+ qd->update(mate1.getQScore());
+ qd->update(mate2.getQScore());
+
+ if (i == 0) {
+ nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
+ nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+ }
+ }
+ else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) {
+ std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl;
}
++cnt;
- if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
+ if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
}
- if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
+ if (verbose) { std::cout<<"estimateFromReads, N"<< i<<" finished."<< std::endl; }
}
mld->finish();
// NOTE(review): only part of this function is visible in this excerpt; the visible statements are kept in their original order.
FILE *fi = fopen(inpF, "r");
if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
- fscanf(fi, "%d", &val);
+ // The read is done outside of assert(): when NDEBUG is defined, assert() discards its argument,
+ // so an fscanf() placed inside it would never run and val would be left unset.
+ if (fscanf(fi, "%d", &val) != 1) { fprintf(stderr, "Cannot read the model type from %s!\n", inpF); exit(-1); }
assert(val == model_type);
ori->read(fi);
qpro->read(fi);
nqpro->read(fi);
- if (fscanf(fi, "%d", &M) == 1) {
- mw = new double[M + 1];
- for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
+ // The trailing section (a count followed by count + 1 values) is optional: if no count can be read, mw is left untouched.
+ if (fscanf(fi, "%d", &val) == 1) {
+ if (M == 0) M = val; // M is still 0: adopt the count stored in the file
+ if (M == val) { // load the values only when the stored count agrees with M
+ mw = new double[M + 1];
+ for (int i = 0; i <= M; i++) {
+ // Same reasoning as for the first read: the fscanf() must not live inside assert().
+ if (fscanf(fi, "%lf", &mw[i]) != 1) { fprintf(stderr, "Cannot read mw[%d] from %s!\n", i, inpF); exit(-1); }
+ }
+ }
}
+
fclose(fi);
}
-//Only master node can call
+//Only the master node can call this. It is only called from EM.cpp.
// Opens outF for writing and closes it again; exits with an error message if the file cannot be opened.
// NOTE(review): only the open/close pair is visible in this excerpt — whatever is written in between is not shown here.
void PairedEndQModel::write(const char* outF) {
  FILE *fo = fopen(outF, "w");
  // fclose() must not be handed a NULL stream, so stop here when the file could not be opened.
  if (fo == NULL) { fprintf(stderr, "Cannot open %s for writing!\n", outF); exit(-1); }
  fclose(fo);
}
// Prepares the model for simulation: stores the sampler, allocates theta_cdf and starts nqpro's simulation.
// NOTE(review): theta is not used by the statements visible in this excerpt — presumably theta_cdf is filled from it in lines not shown; confirm.
-void PairedEndQModel::startSimulation(simul* sampler, double* theta) {
+void PairedEndQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
this->sampler = sampler; // keep the caller's sampler pointer in the member of the same name
theta_cdf = new double[M + 1]; // M + 1 slots (indices 0..M); NOTE(review): presumably released in finishSimulation() — confirm
nqpro->startSimulation();
}
-bool PairedEndQModel::simulate(int rid, PairedEndReadQ& read, int& sid) {
+bool PairedEndQModel::simulate(READ_INT_TYPE rid, PairedEndReadQ& read, int& sid) {
int dir, pos;
int insertL, mateL1, mateL2;
std::string name;
readseq2 = qpro->simulate(sampler, mateL2, m2pos, m2dir, qual2, ref);
}
- std::ostringstream stdout;
- stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
- name = stdout.str();
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
+ name = strout.str();
read = PairedEndReadQ(SingleReadQ(name + "/1", readseq1, qual1), SingleReadQ(name + "/2", readseq2, qual2));
// Fills mw[0..M]. mw[0] is set to 1.0. For each i in 1..M, mw[i] is 1.0 minus the sum, over every scanned
// position (0 .. end - 1) of reference i where getMask() is true and every fragment length allowed at that
// position, of gld->getAdjustedProb(...) * rspd->getAdjustedProb(...). Results below 1e-8 are set to 0.0.
// NOTE(review): mw is written without a NULL check here — presumably it is allocated before calcMW() runs; confirm.
void PairedEndQModel::calcMW() {
- assert(seedLen >= OLEN && mld->getMinL() >= seedLen);
-
- memset(mw, 0, sizeof(double) * (M + 1));
- mw[0] = 1.0;
-
- for (int i = 1; i <= M; i++) {
- RefSeq& ref = refs->getRef(i);
- int totLen = ref.getTotLen();
- int fullLen = ref.getFullLen();
- int end = std::min(fullLen, totLen - gld->getMinL() + 1);
- double value = 0.0;
- int minL, maxL;
- int effL, pfpos;
-
- //seedPos is fpos here
- for (int seedPos = 0; seedPos < end; seedPos++)
- if (ref.getMask(seedPos)) {
- minL = gld->getMinL();
- maxL = std::min(gld->getMaxL(), totLen - seedPos);
- pfpos = seedPos;
- for (int fragLen = minL; fragLen <= maxL; fragLen++) {
- effL = std::min(fullLen, totLen - fragLen + 1);
- value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
- }
- }
+ assert(mld->getMinL() >= seedLen); // the smallest mate length must be at least the seed length
+
+ memset(mw, 0, sizeof(double) * (M + 1)); // zero all M + 1 slots first
+ mw[0] = 1.0;
+
+ for (int i = 1; i <= M; i++) { // slots 1..M, one per reference fetched with refs->getRef(i)
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ int end = std::min(fullLen, totLen - gld->getMinL() + 1); // positions from this bound on are not scanned
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
+
+ //seedPos is fpos here
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) { // only positions where getMask() is true contribute to value
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), totLen - seedPos); // fragment length is capped at totLen - seedPos
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
+ }
+ }
- mw[i] = 1.0 - value;
+ mw[i] = 1.0 - value; // complement of what was accumulated over the getMask()-true positions
- if (mw[i] < 1e-8) {
- // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
- mw[i] = 0.0;
- }
- }
+ if (mw[i] < 1e-8) { // anything below 1e-8 is stored as exactly 0.0
+ // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
+ }
}
#endif /* PAIREDENDQMODEL_H_ */