From 229ba9d68e0a801907631887640ab475d51c560c Mon Sep 17 00:00:00 2001 From: Bo Li Date: Tue, 17 Jan 2012 23:36:57 -0600 Subject: [PATCH] rsem v1.1.14 --- GTFItem.h | 216 +++++++++---------- README.md | 28 ++- convert_sam_for_rsem => convert-sam-for-rsem | 10 +- what_is_new | 10 + 4 files changed, 141 insertions(+), 123 deletions(-) rename convert_sam_for_rsem => convert-sam-for-rsem (94%) create mode 100644 what_is_new diff --git a/GTFItem.h b/GTFItem.h index 4e50270..a908375 100644 --- a/GTFItem.h +++ b/GTFItem.h @@ -10,114 +10,114 @@ #include "utils.h" class GTFItem { - public: - - GTFItem() { - seqname = source = feature = ""; - score = ""; - start = end = 0; - strand = 0; //strand is a char variable - frame = ""; - gene_id = transcript_id = ""; - left = ""; - } - - bool operator<(const GTFItem& o) const { - if (gene_id != o.gene_id) return gene_id < o.gene_id; - if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id; - return start < o.start; - } - - void my_assert(char value, std::string& line, const std::string& msg) { - if (!value) { - fprintf(stderr, ".gtf file might be corrupted!\n"); - fprintf(stderr, "Stop at line : %s\n", line.c_str()); - fprintf(stderr, "Error Message: %s\n", msg.c_str()); - exit(-1); - } - } - - void parse(std::string line) { - std::istringstream strin(line); - std::string tmp; - - getline(strin, seqname, '\t'); - getline(strin, source, '\t'); - getline(strin, feature, '\t'); - getline(strin, tmp, '\t'); - start = atoi(tmp.c_str()); - getline(strin, tmp, '\t'); - end = atoi(tmp.c_str()); - getline(strin, score, '\t'); - getline(strin, tmp, '\t'); - my_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!"); - strand = tmp[0]; - getline(strin, frame, '\t'); - - getline(strin, left); // assign attributes and possible comments into "left" - - strin.clear(); strin.str(left); - bool find_gene_id = false, find_transcript_id = false; - - while (getline(strin, tmp, ';') && (!find_gene_id || !find_transcript_id)) { - tmp = cleanStr(tmp); - size_t pos = tmp.find(' '); - my_assert((pos != std::string::npos), line, "Cannot separate the identifier from the value for attribute " + tmp + "!"); - std::string identifier = tmp.substr(0, pos); - - if (identifier == "gene_id") { - my_assert(!find_gene_id, line, "gene_id appear more than once!"); - tmp = cleanStr(tmp.substr(pos)); - my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!"); - gene_id = tmp.substr(1, tmp.length() - 2); - find_gene_id = true; - } else if (identifier == "transcript_id") { - my_assert(!find_transcript_id, line, "transcript_id appear more than once!"); - tmp = cleanStr(tmp.substr(pos)); - my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!"); - transcript_id = tmp.substr(1, tmp.length() - 2); - find_transcript_id = true; - } - } - - my_assert(find_gene_id, line, "Cannot find gene_id!"); - my_assert(find_transcript_id, line, "Cannot find transcript_id!"); - } - - std::string getSeqName() { return seqname; } - std::string getSource() { return source; } - std::string getFeature() { return feature; } - int getStart() { return start; } - int getEnd() { return end; } - char getStrand() { return strand; } - std::string getScore() { return score; } // float, integer or "." ; let downstream programs parse it - std::string getFrame() { return frame; } // 0, 1, 2, or "."; let downstream programs parse it - std::string getGeneID() { return gene_id; } - std::string getTranscriptID() { return transcript_id; } - std::string getLeft() { return left; } - - void setGeneID(const std::string& gene_id) { - this->gene_id = gene_id; - } - - std::string toString() { - std::string val; - std::ostringstream strout; - strout<gene_id = gene_id; + } + + std::string toString() { + std::string val; + std::ostringstream strout; + strout< Introduction RSEM is a software package for estimating gene and isoform expression -levels from RNA-Seq data. The new RSEM package (rsem-1.x) provides an -user-friendly interface, supports threads for parallel computation of -the EM algorithm, single-end and paired-end read data, quality scores, -variable-length reads and RSPD estimation. It can also generate -genomic-coordinate BAM files and UCSC wiggle files for -visualization. In addition, it provides posterior mean and 95% -credibility interval estimates for expression levels. For -visualization, it can also generate transcript-coordinate BAM files -and visualize them and also models learned. +levels from RNA-Seq data. The RSEM package provides an user-friendly +interface, supports threads for parallel computation of the EM +algorithm, single-end and paired-end read data, quality scores, +variable-length reads and RSPD estimation. In addition, it provides +posterior mean and 95% credibility interval estimates for expression +levels. For visualization, It can generate BAM and Wiggle files in +both transcript-coordinate and genomic-coordinate. Genomic-coordinate +files can be visualized by both UCSC Genome browser and Broad +Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate +files can be visualized by IGV. RSEM also has its own scripts to +generate transcript read depth plots in pdf format. The unique feature +of RSEM is, the read depth plots can be stacked, with read depth +contributed to unique reads shown in black and contributed to +multi-reads shown in red. In addition, models learned from data can +also be visualized. Last but not least, RSEM contains a simulator. ## Compilation & Installation @@ -152,7 +158,7 @@ wiggle_name: the name the user wants to use for this wiggle plot For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html). -For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). +For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). Note: Although IGV can generate read depth plot from the BAM file given, it cannot recognize "ZW" tag RSEM puts. Therefore IGV counts each alignment as weight 1 instead of the expected weight for the plot it generates. So we recommend to use the wiggle file generated by RSEM for read depth visualization. #### c) Generating Transcript Wiggle Plots @@ -258,6 +264,8 @@ map_file: transcript-to-gene-map file's name. RSEM uses the [Boost C++](http://www.boost.org) and [samtools](http://samtools.sourceforge.net) libraries. +We thank earonesty for contributing patches. + ## License RSEM is licensed under the [GNU General Public License v3](http://www.gnu.org/licenses/gpl-3.0.html). diff --git a/convert_sam_for_rsem b/convert-sam-for-rsem similarity index 94% rename from convert_sam_for_rsem rename to convert-sam-for-rsem index a6bc124..68a42d7 100755 --- a/convert_sam_for_rsem +++ b/convert-sam-for-rsem @@ -119,13 +119,13 @@ __END__ =head1 NAME -convert_sam_for_rsem +convert-sam-for-rsem =head1 SYNOPSIS =over - convert_sam_for_rsem [options] reference_name input_sam + convert-sam-for-rsem [options] reference_name input_sam =back @@ -153,7 +153,7 @@ Output the converted SAM file into . (Default: STDOUT) =item B<-T/--temporary-directory> -'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". +'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". =item B<-h/--help> @@ -175,10 +175,10 @@ Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam' 1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': - convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz + convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz 2) Output to 'input_for_rsem.sam' directly: - convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam =cut diff --git a/what_is_new b/what_is_new new file mode 100644 index 0000000..7168193 --- /dev/null +++ b/what_is_new @@ -0,0 +1,10 @@ +In this version of RSEM, several things are updated: + +- Added --chunkmbs option to rsem-calculate-expression (patch contributed by earonesty) +- Added --sampling-for-bam option to rsem-calculate-expression, in the bam file, instead of providing expected weights, for each read RSEM samples one alignment based on the expected weights +- RSEM can generate BAM and Wiggle files in both genomic-coordinate and transcript-coordinate +- Added rsem-plot-transcript-wiggles. This script can generate transcript-coordinate wiggle plots in pdf format. One unique feature is, a stacked plot can be generated, with unique read contribution shown as black and multi-read contribution shown as red +- Added convert_sam_for_rsem script for users do not use bowtie aligner +- Modified RSEM's GTF file parser. Now RSEM does not require "transcript_id" and "gene_id" be the first two attributes shown +- Improved descriptions for thread related errors. + -- 2.39.2