From: Bo Li Date: Thu, 27 Mar 2014 23:07:42 +0000 (-0500) Subject: Modified the transcript extraction behavior of 'rsem-prepare-reference'. For transcr... X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=0f8fbc39759d8a8a4fa755903b8bde403b0da6c9 Modified the transcript extraction behavior of 'rsem-prepare-reference'. For transcripts that cannot be extracted, instead of failing the whole script, warning information is produced. Those transcripts are ignored --- diff --git a/Transcripts.h b/Transcripts.h index 5c17f95..033669b 100644 --- a/Transcripts.h +++ b/Transcripts.h @@ -16,7 +16,6 @@ #include "my_assert.h" #include "Transcript.h" - class Transcripts { public: Transcripts(int type = 0) { @@ -29,6 +28,14 @@ public: int getM() { return M; } + // used in shrinking the transcripts + void setM(int M) { this->M = M; transcripts.resize(M + 1); } + + void move(int from, int to) { + assert(from >= to); + if (from > to) transcripts[to] = transcripts[from]; + } + int getType() { return type; } void setType(int type) { this->type = type; } diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index 8a2a166..2c6f2c7 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -4,6 +4,7 @@ RSEM v1.2.12 - Added '--calc-pme' option for 'rsem-calculate-expression' to calculate posterior mean estimates only (no credibility intervals) - Modified the shebang line of RSEM perl scripts to make them more portable - Added '--seed' option for 'rsem-simulate-reads' to enable users set the seed of random number generator used by the simulation +- Modified the transcript extraction behavior of 'rsem-prepare-reference'. For transcripts that cannot be extracted, instead of failing the whole script, warning information is produced. Those transcripts are ignored -------------------------------------------------------------------------------------------- diff --git a/extractRef.cpp b/extractRef.cpp index 2a26bce..24e17b5 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -9,6 +9,7 @@ #include #include "utils.h" +#include "my_assert.h" #include "GTFItem.h" #include "Transcript.h" #include "Transcripts.h" @@ -53,10 +54,7 @@ void loadMappingInfo(char* mappingF) { ifstream fin(mappingF); string line, key, value; - if (!fin.is_open()) { - fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); - exit(-1); - } + general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist."); mi_table.clear(); while (getline(fin, line)) { @@ -86,14 +84,8 @@ bool buildTranscript(int sp, int ep) { int start = items[i].getStart(); int end = items[i].getEnd(); - if (strand != items[i].getStrand()) { - fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n"); - exit(-1); - } - if (seqname != items[i].getSeqName()) { - fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n"); - exit(-1); - } + general_assert(strand == items[i].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!"); + general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!"); if (cur_e + 1 < start) { if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); @@ -110,10 +102,10 @@ bool buildTranscript(int sp, int ep) { void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); - string line, curgid, tid, gid; // curgid: current gene id; + string line, tid, gid; GTFItem item; - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } + general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; @@ -124,22 +116,18 @@ void parse_gtf_file(char* gtfF) { string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { - fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); - fprintf(stderr, "\t%s\n\n", line.c_str()); + printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); + printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { - fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); - fprintf(stderr, "\t%s\n\n", line.c_str()); + printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); + printf("\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); - if (mi_iter == mi_table.end()) { - fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); - exit(-1); - } - //assert(iter != table.end()); + general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); gid = mi_iter->second; item.setGeneID(gid); } @@ -154,16 +142,12 @@ void parse_gtf_file(char* gtfF) { sort(items.begin(), items.end()); - starts.clear(); - sn2tr.clear(); - curgid = ""; - int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); + sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); - gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; @@ -174,10 +158,6 @@ void parse_gtf_file(char* gtfF) { int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); - if (curgid != gid) { - starts.push_back(sid); - curgid = gid; - } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector vec(1, sid); @@ -190,26 +170,54 @@ void parse_gtf_file(char* gtfF) { sp = ep + 1; } - M = transcripts.getM(); - starts.push_back(M + 1); items.clear(); - if (M < 1) { - fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); - exit(-1); - } + M = transcripts.getM(); + general_assert(M > 0, "The reference contains no transcripts!"); if (verbose) { printf("Parsing gtf File is done!\n"); } } char check(char c) { - if (!isalpha(c)) { fprintf(stderr, "Sequence contains unknown letter '%c'!\n", c); exit(-1); } - //assert(isalpha(c)); + general_assert(isalpha(c), "Sequence contains unknown letter '" + ctos(c) + "'!"); if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N'; if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n'; return c; } +void shrink() { + int curp = 0; + + for (int i = 1; i <= M; i++) + if (seqs[i] == "") { + const Transcript& transcript = transcripts.getTranscriptAt(i); + printf("Warning: Cannot extract transcript %s's sequence since the chromosome it locates, %s, is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); + } + else { + ++curp; + transcripts.move(i, curp); + if (i > curp) seqs[curp] = seqs[i]; + } + + printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp); + + transcripts.setM(curp); + M = transcripts.getM(); + general_assert(M > 0, "The reference contains no transcripts!"); + + starts.clear(); + string curgid = "", gid; + + for (int i = 1; i <= M; i++) { + gid = transcripts.getTranscriptAt(i).getGeneID(); + if (curgid != gid) { + starts.push_back(i); + curgid = gid; + } + } + starts.push_back(M + 1); +} + void writeResults(char* refName) { int s; ofstream fout; @@ -268,7 +276,7 @@ int main(int argc, char* argv[]) { int start = hasMappingFile ? 6 : 5; for (int i = start; i < argc; i++) { fin.open(argv[i]); - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } + general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); @@ -299,19 +307,9 @@ int main(int argc, char* argv[]) { if (verbose) { printf("%s is processed!\n", argv[i]); } } - - for (int i = 1; i <= M; i++) { - if (seqs[i] == "") { - const Transcript& transcript = transcripts.getTranscriptAt(i); - - fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s! Loading chromosome %s's sequence is failed. Please check if 1) the chromosome directory is set correctly; 2) the list of chromosome files is complete; 3) the FASTA files containing chromosome sequences are not truncated or having wrong format.\n", \ - transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str(), transcript.getSeqName().c_str()); - exit(-1); - } - } - sort(chrvec.begin(), chrvec.end()); + shrink(); if (verbose) { printf("Extracting sequences is done!\n"); } writeResults(argv[1]); diff --git a/makefile b/makefile index aec4b2d..76e1ac3 100644 --- a/makefile +++ b/makefile @@ -12,7 +12,7 @@ Transcript.h : utils.h Transcripts.h : my_assert.h Transcript.h -rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp +rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts : utils.h my_assert.h Transcript.h Transcripts.h synthesisRef.cpp