From a1dc3c6c442d288c7f4218cbb8b7a54f08e9d980 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 29 Nov 2012 21:43:49 -0600 Subject: [PATCH] - Added poly(A) tails to 'reference_name.transcripts.fa' so that the RSEM generated transcript unsorted BAM file can be fed into RSEM as an input file. However, users need to rebuild their references if they want to visualize the transcript level wiggle files and BAM files using IGV - Modified 'rsem-tbam2gbam' to convert users' alignments from transcript BAM files into genome BAM files, provided users use 'reference_name.idx.fa' to build indices for their aligners - Updated EBSeq from v1.1.3 to v1.1.4 - Corrected several typos in warning messages --- .gitignore | 2 + BamConverter.h | 96 ++++++++++++++++++++++++------------ BamWriter.h | 20 +------- EM.cpp | 22 ++++----- Gibbs.cpp | 2 +- README.md | 32 +++++++++--- WHAT_IS_NEW | 9 ++++ calcCI.cpp | 2 +- extractRef.cpp | 38 +++++++------- preRef.cpp | 109 ++++++++++++++++++++++------------------- rsem-prepare-reference | 4 +- 11 files changed, 197 insertions(+), 139 deletions(-) diff --git a/.gitignore b/.gitignore index 02f7995..05044a3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ sam/samtools .project .cproject update_doc.sh +EBSeq/EBSeq +EBSeq/blockmodeling diff --git a/BamConverter.h b/BamConverter.h index 320824b..4ea7d3e 100644 --- a/BamConverter.h +++ b/BamConverter.h @@ -53,12 +53,35 @@ BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_l transcripts.buildMappings(in->header->n_targets, in->header->target_name); bam_header_t *out_header = sam_header_read2(chr_list); + refmap.clear(); for (int i = 0; i < out_header->n_targets; i++) { refmap[out_header->target_name[i]] = i; } - append_header_text(out_header, in->header->text, in->header->l_text); + if (in->header->l_text > 0) { + char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n"; + int comment_len = strlen(comment); + + //Filter @SQ fields if the BAM file is user provided + char *text = in->header->text; + int l_text = in->header->l_text; + char *new_text = new char[l_text + comment_len]; + int pos = 0, s = 0; + while (pos < l_text) { + if ((pos + 2 < l_text) && (text[pos] == '@') && (text[pos + 1] == 'S') && (text[pos + 2] == 'Q')) { + pos += 3; + while (pos < l_text && text[pos] != '\n') ++pos; + } + else new_text[s++] = text[pos]; + ++pos; + } + strncpy(new_text + s, comment, comment_len); + s += comment_len; + + append_header_text(out_header, new_text, s); + delete[] new_text; + } out = samopen(outF, "wb", out_header); assert(out != 0); @@ -101,35 +124,43 @@ void BamConverter::process() { if (!notgood) { - if (isPaired) assert((b->core.tid == b2->core.tid) && (b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing - - const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1); - - convert(b, transcript); - if (isPaired) { - convert(b2, transcript); - b->core.mpos = b2->core.pos; - b2->core.mpos = b->core.pos; - } - - if (cqname != bam1_qname(b)) { - writeCollapsedLines(); - cqname = bam1_qname(b); - collapseMap.init(isPaired); - } - collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW"))); + // for collapsing + if (isPaired) { + assert(b->core.tid == b2->core.tid); + if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + bam1_t *tmp = b; b = b2; b2 = tmp; + } + } + + const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1); + + convert(b, transcript); + if (isPaired) { + convert(b2, transcript); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; + } + + if (cqname != bam1_qname(b)) { + writeCollapsedLines(); + cqname = bam1_qname(b); + collapseMap.init(isPaired); + } + + uint8_t *p = bam_aux_get(b, "ZW"); + float prb = (p != NULL? bam_aux2f(p) : 1.0); + collapseMap.insert(b, b2, prb); } else { - assert(cqname != bam1_qname(b)); + assert(cqname != bam1_qname(b)); - writeCollapsedLines(); - cqname = bam1_qname(b); - collapseMap.init(isPaired); + writeCollapsedLines(); + cqname = bam1_qname(b); + collapseMap.init(isPaired); - samwrite(out, b); - if (isPaired) samwrite(out, b2); + samwrite(out, b); + if (isPaired) samwrite(out, b2); } - } writeCollapsedLines(); @@ -187,16 +218,21 @@ inline void BamConverter::writeCollapsedLines() { bam1_t *tmp_b = NULL,*tmp_b2 = NULL; float prb; bool isPaired; + uint8_t *p; if (!collapseMap.empty(isPaired)) { while (collapseMap.next(tmp_b, tmp_b2, prb)) { - memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); - tmp_b->core.qual = getMAPQ(prb); + p = bam_aux_get(tmp_b, "ZW"); + if (p != NULL) { + memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b->core.qual = getMAPQ(prb); + } + else tmp_b->core.qual = getMAPQ(1.0); samwrite(out, tmp_b); if (isPaired) { - memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); - tmp_b2->core.qual = tmp_b->core.qual; - samwrite(out, tmp_b2); + if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b2->core.qual = tmp_b->core.qual; + samwrite(out, tmp_b2); } bam_destroy1(tmp_b); if (isPaired) bam_destroy1(tmp_b2); diff --git a/BamWriter.h b/BamWriter.h index 733eaef..a4e7dc5 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -55,9 +55,6 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const //generate output's header bam_header_t *out_header = bam_header_dwt(in->header); - for (int i = 0; i < out_header->n_targets; i++) { - out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength(); // transcript length without poly(A) tail - } std::ostringstream strout; strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n"; @@ -116,7 +113,7 @@ void BamWriter::work(HitWrapper wrapper) { while (samread(in, b) >= 0 && samread(in, b2) >= 0) { cnt += 2; - if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; } assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))); @@ -140,22 +137,9 @@ void BamWriter::work(HitWrapper wrapper) { convert(b, hit->getConPrb()); convert(b2, hit->getConPrb()); - } - /* - else { - // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:! - char exclamation = '!'; - if (!(b->core.flag & 0x0004)) { - b->core.flag |= 0x0004; - bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation); - } - if (!(b2->core.flag & 0x0004)) { - b2->core.flag |= 0x0004; - bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation); - } + // omit "b->core.mpos = b2->core.pos; b2->core.mpos = b->core.pos" for the reason that it is possible that only one mate is alignable } - */ samwrite(out, b); samwrite(out, b2); diff --git a/EM.cpp b/EM.cpp index fdc75cc..59783f3 100644 --- a/EM.cpp +++ b/EM.cpp @@ -336,7 +336,7 @@ void calcExpressionValues(const vector& theta, const vector& eel frac[i] = theta[i]; denom += frac[i]; } - general_assert(denom > 0, "No alignable reads?!"); + general_assert(denom >= EPSILON, "No alignable reads?!"); for (int i = 1; i <= M; i++) frac[i] /= denom; //calculate FPKM @@ -384,18 +384,18 @@ void writeResults(ModelType& model, double* counts) { } if (gene_tpm[i] < EPSILON) { - double frac = 1.0 / (e - b); - for (int j = b; j < e; j++) { - glens[i] += tlens[j] * frac; - gene_eels[i] += eel[j] * frac; - } + double frac = 1.0 / (e - b); + for (int j = b; j < e; j++) { + glens[i] += tlens[j] * frac; + gene_eels[i] += eel[j] * frac; + } } else { - for (int j = b; j < e; j++) { - isopct[j] = tpm[j] / gene_tpm[i]; - glens[i] += tlens[j] * isopct[j]; - gene_eels[i] += eel[j] * isopct[j]; - } + for (int j = b; j < e; j++) { + isopct[j] = tpm[j] / gene_tpm[i]; + glens[i] += tlens[j] * isopct[j]; + gene_eels[i] += eel[j] * isopct[j]; + } } } diff --git a/Gibbs.cpp b/Gibbs.cpp index bf8563e..b0287a3 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -254,7 +254,7 @@ void calcExpressionValues(const vector& theta, const vector& eel frac[i] = theta[i]; denom += frac[i]; } - general_assert(denom > 0, "No alignable reads?!"); + general_assert(denom >= EPSILON, "No alignable reads?!"); for (int i = 1; i <= M; i++) frac[i] /= denom; //calculate FPKM diff --git a/README.md b/README.md index 1e64574..4b0b3ec 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,27 @@ unsorted BAM file, 'sample_name.genome.sorted.bam' and generated by the samtools included. All these files are in genomic coordinates. -#### a) Generating a Wiggle file +#### a) Converting transcript BAM file into genome BAM file + +Normally, RSEM will do this for you via '--output-genome-bam' option +of 'rsem-calculate-expression'. However, if you have run +'rsem-prepare-reference' and use 'reference_name.idx.fa' to build +indices for your aligner, you can use 'rsem-tbam2gbam' to convert your +transcript coordinate BAM alignments file into a genomic coordinate +BAM alignments file without the need to run the whole RSEM +pipeline. Please note that 'rsem-prepare-reference' will convert all +'N' into 'G' by default for 'reference_name.idx.fa'. If you do not +want this to happen, please use '--no-ntog' option. + +Usage: + + rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output + +reference_name : The name of reference built by 'rsem-prepare-reference' +unsorted_transcript_bam_input : This file should satisfy: 1) the alignments of a same read are grouped together, 2) for any paired-end alignment, the two mates should be adjacent to each other, 3) this file should not be sorted by samtools +genome_bam_output : The output genomic coordinate BAM file's name + +#### b) Generating a Wiggle file A wiggle plot representing the expected number of reads overlapping each position in the genome/transcript set can be generated from the @@ -162,9 +182,9 @@ Usage: sorted_bam_input : Input BAM format file, must be sorted wig_output : Output wiggle file's name, e.g. output.wig wiggle_name : the name of this wiggle plot ---no-fractional-weight : If this is set, RSEM will not look for "ZW" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line. +--no-fractional-weight : If this is set, RSEM will not look for "ZW" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line -#### b) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV) +#### c) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV) For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html). @@ -174,7 +194,7 @@ Here are some guidance for visualizing transcript coordinate files using IGV: 1) Import the transcript sequences as a genome -Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be 'reference_name.idx.fa'. After that, click Save button. Suppose ID is filled as 'reference_name', a file called 'reference_name.genome' will be generated. Next time, we can use: File -> Load Genome, then select 'reference_name.genome'. +Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be 'reference_name.transcripts.fa'. After that, click Save button. Suppose ID is filled as 'reference_name', a file called 'reference_name.genome' will be generated. Next time, we can use: File -> Load Genome, then select 'reference_name.genome'. 2) Load visualization files @@ -182,7 +202,7 @@ Select File -> Load from File, then choose one transcript coordinate visualizati igvtools tile reference_name.transcript.wig reference_name.transcript.tdf reference_name.genome -#### c) Generating Transcript Wiggle Plots +#### d) Generating Transcript Wiggle Plots To generate transcript wiggle plots, you should run the 'rsem-plot-transcript-wiggles' program. Run @@ -192,7 +212,7 @@ To generate transcript wiggle plots, you should run the to get usage information or visit the [rsem-plot-transcript-wiggles documentation page](http://deweylab.biostat.wisc.edu/rsem/rsem-plot-transcript-wiggles.html). -#### d) Visualize the model learned by RSEM +#### e) Visualize the model learned by RSEM RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned. diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index 073de14..31025d8 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -1,3 +1,12 @@ +RSEM v1.2.1 + +- Added poly(A) tails to 'reference_name.transcripts.fa' so that the RSEM generated transcript unsorted BAM file can be fed into RSEM as an input file. However, users need to rebuild their references if they want to visualize the transcript level wiggle files and BAM files using IGV +- Modified 'rsem-tbam2gbam' to convert users' alignments from transcript BAM files into genome BAM files, provided users use 'reference_name.idx.fa' to build indices for their aligners +- Updated EBSeq from v1.1.3 to v1.1.4 +- Corrected several typos in warning messages + +-------------------------------------------------------------------------------------------- + RSEM v1.2.0 - Changed output formats, added FPKM field etc. diff --git a/calcCI.cpp b/calcCI.cpp index dbfe53a..45c7683 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -138,7 +138,7 @@ void* sample_theta_from_c(void* arg) { for (int i = 0; i < nSpC; i++) { double sum = 0.0; for (int j = 0; j <= M; j++) { - theta[j] = ((j == 0 || eel[j] >= EPSILON && mw[j] >= EPSILON) ? (*rgs[j])() / mw[j] : 0.0); + theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0); sum += theta[j]; } assert(sum >= EPSILON); diff --git a/extractRef.cpp b/extractRef.cpp index 0a46de0..1a56cc1 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -50,24 +50,24 @@ map mi_table; // mapping info table map::iterator mi_iter; //mapping info table's iterator void loadMappingInfo(char* mappingF) { - ifstream fin(mappingF); - string line, key, value; - - if (!fin.is_open()) { - fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); - exit(-1); - } - - mi_table.clear(); - while (getline(fin, line)) { - line = cleanStr(line); - if (line[0] == '#') continue; - istringstream strin(line); - strin>>value>>key; - mi_table[key] = value; - } - - fin.close(); + ifstream fin(mappingF); + string line, key, value; + + if (!fin.is_open()) { + fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); + exit(-1); + } + + mi_table.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>>value>>key; + mi_table[key] = value; + } + + fin.close(); } bool buildTranscript(int sp, int ep) { @@ -305,7 +305,7 @@ int main(int argc, char* argv[]) { if (seqs[i] == "") { const Transcript& transcript = transcripts.getTranscriptAt(i); fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s, whose information might not be provided! Please check if the chromosome directory is set correctly or the list of chromosome files is complete.\n", \ - transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str()); + transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); exit(-1); } } diff --git a/preRef.cpp b/preRef.cpp index 65e034b..7518ec4 100644 --- a/preRef.cpp +++ b/preRef.cpp @@ -23,7 +23,7 @@ PolyARules rules; Refs refs; ofstream fout; -char refF[STRLEN], alignerFastaF[STRLEN]; +char refF[STRLEN], alignerFastaF[STRLEN], transF[STRLEN]; int polyAChoice, polyALen; char exceptionF[STRLEN]; @@ -33,54 +33,61 @@ bool quiet; // verbose = !quiet; // always generate references for aligners, default convert all N into G int main(int argc, char* argv[]) { - if (argc < 4) { - printf("USAGE : rsem-preref refFastaF polyAChoice refName [-l polyALen] [-f exceptionF] [--no-ntog] [-q]\n\n"); - printf(" refFastaF: a FASTA format file contains all reference transcripts\n"); - printf(" polyAChoice: choice for polyA tail padding.It is a number from {0,1,2}\n"); - printf(" 0: pad polyA tail\n"); - printf(" 1: do not pad polyA tail at all\n"); - printf(" 2: pad polyA tail for all references but those in exceptionF\n"); - printf(" -l: polyALen: specify the length of polyA tail you want to pad. Default is 100\n"); - printf(" -f: exceptionF: file contains a list of exception reference ids. IDs starts from 1. Must set if polyAChoice = 2\n"); - printf(" --no-ntog: do not convert N in references into G\n"); - printf(" -q: quiet\n"); - exit(-1); - } - - - polyAChoice = atoi(argv[2]); - - polyALen = 125; - ntog = true; - quiet = false; - memset(exceptionF, 0, sizeof(exceptionF)); - - for (int i = 4; i < argc; i++) { - if (!strcmp(argv[i], "-l")) { polyALen = atoi(argv[i + 1]); } - if (!strcmp(argv[i], "-f")) { strcpy(exceptionF, argv[i + 1]); } - if (!strcmp(argv[i], "--no-ntog")) { ntog = false; } - if (!strcmp(argv[i], "-q")) { quiet = true; } - } - - verbose = !quiet; - - //make references - rules = PolyARules(polyAChoice, polyALen, exceptionF); - refs.makeRefs(argv[1], refp, rules); - M = refs.getM(); - - //save references - sprintf(refF, "%s.seq", argv[3]); - refs.saveRefs(refF); - - sprintf(alignerFastaF, "%s.idx.fa", argv[3]); - fout.open(alignerFastaF); - for (int i = 1; i <= M; i++) { - fout<<">"<"<< refs.getRef(i).getName()<< endl<< refs.getRef(i).getSeq()<< endl; + } + fout.close(); + if (verbose) printf("%s is generated!\n", transF); + + sprintf(alignerFastaF, "%s.idx.fa", argv[3]); + fout.open(alignerFastaF); + for (int i = 1; i <= M; i++) { + fout<<">"< -Do not add poly(A) tails to the end of reference isoforms. (Default: add poly(A) tails to all transcripts) +Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts) =item B<--no-polyA-subset> @@ -207,7 +207,7 @@ This program will generate 'reference_name.grp', 'reference_name.ti', 'reference 'reference_name.grp', 'reference_name.ti', 'reference_name.seq', 'reference_name.idx.fa', and 'reference_name.chrlist' are used by RSEM internally. -B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are not added. +B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are added unless '--no-polyA' is set. =head1 EXAMPLES -- 2.39.2