X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=extractRef.cpp;h=24e17b5ec08b55310c123ebd44e079439fdbb0c8;hp=693af6635ae9df85006dcc7b991ee5d2e497bed8;hb=83ce658a4b9c5f04c081316314b66b94ad5ffbde;hpb=37513308b7557ae65899e59df8e43b5dec90da5c diff --git a/extractRef.cpp b/extractRef.cpp index 693af66..24e17b5 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -9,6 +9,7 @@ #include #include "utils.h" +#include "my_assert.h" #include "GTFItem.h" #include "Transcript.h" #include "Transcripts.h" @@ -50,24 +51,21 @@ map mi_table; // mapping info table map::iterator mi_iter; //mapping info table's iterator void loadMappingInfo(char* mappingF) { - ifstream fin(mappingF); - string line, key, value; - - if (!fin.is_open()) { - fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); - exit(-1); - } - - mi_table.clear(); - while (getline(fin, line)) { - line = cleanStr(line); - if (line[0] == '#') continue; - istringstream strin(line); - strin>>value>>key; - mi_table[key] = value; - } + ifstream fin(mappingF); + string line, key, value; + + general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist."); + + mi_table.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>>value>>key; + mi_table[key] = value; + } - fin.close(); + fin.close(); } bool buildTranscript(int sp, int ep) { @@ -86,14 +84,8 @@ bool buildTranscript(int sp, int ep) { int start = items[i].getStart(); int end = items[i].getEnd(); - if (strand != items[i].getStrand()) { - fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n"); - exit(-1); - } - if (seqname != items[i].getSeqName()) { - fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n"); - exit(-1); - } + general_assert(strand == items[i].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!"); + general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!"); if (cur_e + 1 < start) { if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); @@ -110,10 +102,10 @@ bool buildTranscript(int sp, int ep) { void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); - string line, curgid, tid, gid; // curgid: current gene id; + string line, tid, gid; GTFItem item; - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } + general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; @@ -124,24 +116,20 @@ void parse_gtf_file(char* gtfF) { string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { - fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); - fprintf(stderr, "\t%s\n\n", line.c_str()); + printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); + printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { - fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); - fprintf(stderr, "\t%s\n\n", line.c_str()); + printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); + printf("\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); - mi_iter = mi_table.find(tid); - if (mi_iter == mi_table.end()) { - fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); - exit(-1); - } - //assert(iter != table.end()); - gid = mi_iter->second; - item.setGeneID(gid); + mi_iter = mi_table.find(tid); + general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); + gid = mi_iter->second; + item.setGeneID(gid); } items.push_back(item); } @@ -154,16 +142,12 @@ void parse_gtf_file(char* gtfF) { sort(items.begin(), items.end()); - starts.clear(); - sn2tr.clear(); - curgid = ""; - int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); + sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); - gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; @@ -174,10 +158,6 @@ void parse_gtf_file(char* gtfF) { int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); - if (curgid != gid) { - starts.push_back(sid); - curgid = gid; - } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector vec(1, sid); @@ -190,26 +170,54 @@ void parse_gtf_file(char* gtfF) { sp = ep + 1; } - M = transcripts.getM(); - starts.push_back(M + 1); items.clear(); - if (M < 1) { - fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); - exit(-1); - } + M = transcripts.getM(); + general_assert(M > 0, "The reference contains no transcripts!"); if (verbose) { printf("Parsing gtf File is done!\n"); } } char check(char c) { - if (!isalpha(c)) { fprintf(stderr, "Sequence contains unknown letter '%c'!\n", c); exit(-1); } - //assert(isalpha(c)); + general_assert(isalpha(c), "Sequence contains unknown letter '" + ctos(c) + "'!"); if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N'; if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n'; return c; } +void shrink() { + int curp = 0; + + for (int i = 1; i <= M; i++) + if (seqs[i] == "") { + const Transcript& transcript = transcripts.getTranscriptAt(i); + printf("Warning: Cannot extract transcript %s's sequence since the chromosome it locates, %s, is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); + } + else { + ++curp; + transcripts.move(i, curp); + if (i > curp) seqs[curp] = seqs[i]; + } + + printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp); + + transcripts.setM(curp); + M = transcripts.getM(); + general_assert(M > 0, "The reference contains no transcripts!"); + + starts.clear(); + string curgid = "", gid; + + for (int i = 1; i <= M; i++) { + gid = transcripts.getTranscriptAt(i).getGeneID(); + if (curgid != gid) { + starts.push_back(i); + curgid = gid; + } + } + starts.push_back(M + 1); +} + void writeResults(char* refName) { int s; ofstream fout; @@ -260,7 +268,6 @@ int main(int argc, char* argv[]) { ifstream fin; string line, gseq, seqname; - void* pt; chrvec.clear(); @@ -269,47 +276,40 @@ int main(int argc, char* argv[]) { int start = hasMappingFile ? 6 : 5; for (int i = start; i < argc; i++) { fin.open(argv[i]); - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); + getline(fin, line); + while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { - gseq += line; - } - - size_t len = gseq.length(); - assert(len > 0); - for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); - - iter = sn2tr.find(seqname); - if (iter == sn2tr.end()) continue; - - chrvec.push_back(ChrInfo(seqname, len)); - - vector& vec = iter->second; - int s = vec.size(); - for (int j = 0; j < s; j++) { - assert(vec[j] > 0 && vec[j] <= M); - transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); - } + while((getline(fin, line)) && (line[0] != '>')) { + gseq += line; + } + + size_t len = gseq.length(); + assert(len > 0); + for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); + + iter = sn2tr.find(seqname); + if (iter == sn2tr.end()) continue; + + chrvec.push_back(ChrInfo(seqname, len)); + + vector& vec = iter->second; + int s = vec.size(); + for (int j = 0; j < s; j++) { + assert(vec[j] > 0 && vec[j] <= M); + transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); + } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[i]); } } - - for (int i = 1; i <= M; i++) { - if (seqs[i] == "") { - fprintf(stderr, "%s's sequence is empty! You must provide all chromosome files of transcripts which are presented in the .gtf file!\n", transcripts.getTranscriptAt(i).getTranscriptID().c_str()); - exit(-1); - } - } - sort(chrvec.begin(), chrvec.end()); + shrink(); if (verbose) { printf("Extracting sequences is done!\n"); } writeResults(argv[1]);