X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=parseIt.cpp;h=e05585d43f5a6a897f874f90506a8add3bc3e571;hb=90df7a1408511063de96e29658ffa289d43cc0bb;hp=c266715b31693686f7db773155d01e7905321102;hpb=a95154919f950f86de9104b2b9dcf1f0c7e83387;p=rsem.git diff --git a/parseIt.cpp b/parseIt.cpp index c266715..e05585d 100644 --- a/parseIt.cpp +++ b/parseIt.cpp @@ -8,11 +8,15 @@ #include #include #include +#include #include "utils.h" #include "GroupInfo.h" +#include "RefSeq.h" +#include "Refs.h" + #include "SingleRead.h" #include "SingleReadQ.h" #include "PairedEndRead.h" @@ -30,9 +34,11 @@ int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number o int nHits; // # of hits int nUnique, nMulti, nIsoMulti; char fn_list[STRLEN]; -char groupF[STRLEN]; +char refF[STRLEN], groupF[STRLEN]; +char imdName[STRLEN]; char datF[STRLEN], cntF[STRLEN]; +Refs refs; GroupInfo gi; SamParser *parser; @@ -42,14 +48,14 @@ int n_os; // number of ostreams ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2 char readOutFs[3][2][STRLEN]; -void init(const char* imdName, char alignFType, const char* alignF) { +map counter; +map::iterator iter; - sprintf(datF, "%s.dat", imdName); - sprintf(cntF, "%s.cnt", imdName); +void init(const char* imdName, char alignFType, const char* alignF) { char* aux = 0; if (strcmp(fn_list, "")) aux = fn_list; - parser = new SamParser(alignFType, alignF, aux); + parser = new SamParser(alignFType, alignF, refs, aux); memset(cat, 0, sizeof(cat)); memset(readOutFs, 0, sizeof(readOutFs)); @@ -64,6 +70,8 @@ void init(const char* imdName, char alignFType, const char* alignF) { for (int j = 0; j < n_os; j++) cat[i][j] = new ofstream(readOutFs[i][j]); } + + counter.clear(); } //Do not allow duplicate for unalignable reads and supressed reads in SAM input @@ -88,11 +96,22 @@ void parseIt(SamParser *parser) { if (record_val >= 0) { record_read.write(n_os, cat[record_val]); ++N[record_val]; + } + // flush out previous read's hits if the read is alignable reads + if (record_val == 1) { hits.updateRI(); nHits += hits.getNHits(); nMulti += hits.calcNumGeneMultiReads(gi); nIsoMulti += hits.calcNumIsoformMultiReads(); hits.write(hit_out); + + iter = counter.find(hits.getNHits()); + if (iter != counter.end()) { + iter->second++; + } + else { + counter[hits.getNHits()] = 1; + } } hits.clear(); @@ -111,11 +130,22 @@ void parseIt(SamParser *parser) { if (record_val >= 0) { record_read.write(n_os, cat[record_val]); ++N[record_val]; + } + + if (record_val == 1) { hits.updateRI(); nHits += hits.getNHits(); nMulti += hits.calcNumGeneMultiReads(gi); nIsoMulti += hits.calcNumIsoformMultiReads(); hits.write(hit_out); + + iter = counter.find(hits.getNHits()); + if (iter != counter.end()) { + iter->second++; + } + else { + counter[hits.getNHits()] = 1; + } } nUnique = N[1] - nMulti; @@ -138,15 +168,15 @@ void release() { int main(int argc, char* argv[]) { bool quiet = false; - if (argc < 5) { - printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n"); + if (argc < 6) { + printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n"); exit(-1); } strcpy(fn_list, ""); read_type = 0; - if (argc > 5) { - for (int i = 5; i < argc; i++) { + if (argc > 6) { + for (int i = 6; i < argc; i++) { if (!strcmp(argv[i], "-t")) { read_type = atoi(argv[i + 1]); } @@ -162,11 +192,17 @@ int main(int argc, char* argv[]) { verbose = !quiet; - init(argv[2], argv[3][0], argv[4]); - + sprintf(refF, "%s.seq", argv[1]); + refs.loadRefs(refF, 1); sprintf(groupF, "%s.grp", argv[1]); gi.load(groupF); + sprintf(imdName, "%s.temp/%s", argv[2], argv[3]); + sprintf(datF, "%s.dat", imdName); + sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]); + + init(imdName, argv[4][0], argv[5]); + hit_out.open(datF); string firstLine(59, ' '); @@ -190,6 +226,11 @@ int main(int argc, char* argv[]) { fout<first<<'\t'<second<