#include "utils.h"
class GTFItem {
- public:
-
- GTFItem() {
- seqname = source = feature = "";
- score = "";
- start = end = 0;
- strand = 0; //strand is a char variable
- frame = "";
- gene_id = transcript_id = "";
- left = "";
- }
-
- bool operator<(const GTFItem& o) const {
- if (gene_id != o.gene_id) return gene_id < o.gene_id;
- if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
- return start < o.start;
- }
-
- void my_assert(char value, std::string& line, const std::string& msg) {
- if (!value) {
- fprintf(stderr, ".gtf file might be corrupted!\n");
- fprintf(stderr, "Stop at line : %s\n", line.c_str());
- fprintf(stderr, "Error Message: %s\n", msg.c_str());
- exit(-1);
- }
- }
-
- void parse(std::string line) {
- std::istringstream strin(line);
- std::string tmp;
-
- getline(strin, seqname, '\t');
- getline(strin, source, '\t');
- getline(strin, feature, '\t');
- getline(strin, tmp, '\t');
- start = atoi(tmp.c_str());
- getline(strin, tmp, '\t');
- end = atoi(tmp.c_str());
- getline(strin, score, '\t');
- getline(strin, tmp, '\t');
- my_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!");
- strand = tmp[0];
- getline(strin, frame, '\t');
-
- getline(strin, left); // assign attributes and possible comments into "left"
-
- strin.clear(); strin.str(left);
- bool find_gene_id = false, find_transcript_id = false;
-
- while (getline(strin, tmp, ';') && (!find_gene_id || !find_transcript_id)) {
- tmp = cleanStr(tmp);
- size_t pos = tmp.find(' ');
- my_assert((pos != std::string::npos), line, "Cannot separate the identifier from the value for attribute " + tmp + "!");
- std::string identifier = tmp.substr(0, pos);
-
- if (identifier == "gene_id") {
- my_assert(!find_gene_id, line, "gene_id appear more than once!");
- tmp = cleanStr(tmp.substr(pos));
- my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
- gene_id = tmp.substr(1, tmp.length() - 2);
- find_gene_id = true;
- } else if (identifier == "transcript_id") {
- my_assert(!find_transcript_id, line, "transcript_id appear more than once!");
- tmp = cleanStr(tmp.substr(pos));
- my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
- transcript_id = tmp.substr(1, tmp.length() - 2);
- find_transcript_id = true;
- }
- }
-
- my_assert(find_gene_id, line, "Cannot find gene_id!");
- my_assert(find_transcript_id, line, "Cannot find transcript_id!");
- }
-
- std::string getSeqName() { return seqname; }
- std::string getSource() { return source; }
- std::string getFeature() { return feature; }
- int getStart() { return start; }
- int getEnd() { return end; }
- char getStrand() { return strand; }
- std::string getScore() { return score; } // float, integer or "." ; let downstream programs parse it
- std::string getFrame() { return frame; } // 0, 1, 2, or "."; let downstream programs parse it
- std::string getGeneID() { return gene_id; }
- std::string getTranscriptID() { return transcript_id; }
- std::string getLeft() { return left; }
-
- void setGeneID(const std::string& gene_id) {
- this->gene_id = gene_id;
- }
-
- std::string toString() {
- std::string val;
- std::ostringstream strout;
- strout<<seqname<<'\t'<<source<<'\t'<<feature<<'\t'<<start<<'\t'<<end<<'\t'<<score<<'\t'<<strand<<'\t'<<frame<<'\t';
- strout<<"gene_id \""<<gene_id<<"\"; transcript_id \""<<transcript_id<<"\";"<<left;
- val = strout.str();
-
- return val;
- }
-
- private:
- std::string seqname, source, feature;
- std::string score;
- int start, end;
- char strand;
- std::string frame;
- std::string gene_id, transcript_id;
- std::string left;
+public:
+
+ GTFItem() {
+ seqname = source = feature = "";
+ score = "";
+ start = end = 0;
+ strand = 0; //strand is a char variable
+ frame = "";
+ gene_id = transcript_id = "";
+ left = "";
+ }
+
+ bool operator<(const GTFItem& o) const {
+ if (gene_id != o.gene_id) return gene_id < o.gene_id;
+ if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
+ return start < o.start;
+ }
+
+ void my_assert(char value, std::string& line, const std::string& msg) {
+ if (!value) {
+ fprintf(stderr, ".gtf file might be corrupted!\n");
+ fprintf(stderr, "Stop at line : %s\n", line.c_str());
+ fprintf(stderr, "Error Message: %s\n", msg.c_str());
+ exit(-1);
+ }
+ }
+
+ void parse(std::string line) {
+ std::istringstream strin(line);
+ std::string tmp;
+
+ getline(strin, seqname, '\t');
+ getline(strin, source, '\t');
+ getline(strin, feature, '\t');
+ getline(strin, tmp, '\t');
+ start = atoi(tmp.c_str());
+ getline(strin, tmp, '\t');
+ end = atoi(tmp.c_str());
+ getline(strin, score, '\t');
+ getline(strin, tmp, '\t');
+ my_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!");
+ strand = tmp[0];
+ getline(strin, frame, '\t');
+
+ getline(strin, left); // assign attributes and possible comments into "left"
+
+ strin.clear(); strin.str(left);
+ bool find_gene_id = false, find_transcript_id = false;
+
+ while (getline(strin, tmp, ';') && (!find_gene_id || !find_transcript_id)) {
+ tmp = cleanStr(tmp);
+ size_t pos = tmp.find(' ');
+ my_assert((pos != std::string::npos), line, "Cannot separate the identifier from the value for attribute " + tmp + "!");
+ std::string identifier = tmp.substr(0, pos);
+
+ if (identifier == "gene_id") {
+ my_assert(!find_gene_id, line, "gene_id appear more than once!");
+ tmp = cleanStr(tmp.substr(pos));
+ my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
+ gene_id = tmp.substr(1, tmp.length() - 2);
+ find_gene_id = true;
+ } else if (identifier == "transcript_id") {
+ my_assert(!find_transcript_id, line, "transcript_id appear more than once!");
+ tmp = cleanStr(tmp.substr(pos));
+ my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
+ transcript_id = tmp.substr(1, tmp.length() - 2);
+ find_transcript_id = true;
+ }
+ }
+
+ my_assert(find_gene_id, line, "Cannot find gene_id!");
+ my_assert(find_transcript_id, line, "Cannot find transcript_id!");
+ }
+
+ std::string getSeqName() { return seqname; }
+ std::string getSource() { return source; }
+ std::string getFeature() { return feature; }
+ int getStart() { return start; }
+ int getEnd() { return end; }
+ char getStrand() { return strand; }
+ std::string getScore() { return score; } // float, integer or "." ; let downstream programs parse it
+ std::string getFrame() { return frame; } // 0, 1, 2, or "."; let downstream programs parse it
+ std::string getGeneID() { return gene_id; }
+ std::string getTranscriptID() { return transcript_id; }
+ std::string getLeft() { return left; }
+
+ void setGeneID(const std::string& gene_id) {
+ this->gene_id = gene_id;
+ }
+
+ std::string toString() {
+ std::string val;
+ std::ostringstream strout;
+ strout<<seqname<<'\t'<<source<<'\t'<<feature<<'\t'<<start<<'\t'<<end<<'\t'<<score<<'\t'<<strand<<'\t'<<frame<<'\t';
+ strout<<"gene_id \""<<gene_id<<"\"; transcript_id \""<<transcript_id<<"\";"<<left;
+ val = strout.str();
+
+ return val;
+ }
+
+private:
+ std::string seqname, source, feature;
+ std::string score;
+ int start, end;
+ char strand;
+ std::string frame;
+ std::string gene_id, transcript_id;
+ std::string left;
};
#endif
## <a name="introduction"></a> Introduction
RSEM is a software package for estimating gene and isoform expression
-levels from RNA-Seq data. The new RSEM package (rsem-1.x) provides an
-user-friendly interface, supports threads for parallel computation of
-the EM algorithm, single-end and paired-end read data, quality scores,
-variable-length reads and RSPD estimation. It can also generate
-genomic-coordinate BAM files and UCSC wiggle files for
-visualization. In addition, it provides posterior mean and 95%
-credibility interval estimates for expression levels. For
-visualization, it can also generate transcript-coordinate BAM files
-and visualize them and also models learned.
+levels from RNA-Seq data. The RSEM package provides an user-friendly
+interface, supports threads for parallel computation of the EM
+algorithm, single-end and paired-end read data, quality scores,
+variable-length reads and RSPD estimation. In addition, it provides
+posterior mean and 95% credibility interval estimates for expression
+levels. For visualization, It can generate BAM and Wiggle files in
+both transcript-coordinate and genomic-coordinate. Genomic-coordinate
+files can be visualized by both UCSC Genome browser and Broad
+Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate
+files can be visualized by IGV. RSEM also has its own scripts to
+generate transcript read depth plots in pdf format. The unique feature
+of RSEM is, the read depth plots can be stacked, with read depth
+contributed to unique reads shown in black and contributed to
+multi-reads shown in red. In addition, models learned from data can
+also be visualized. Last but not least, RSEM contains a simulator.
## <a name="compilation"></a> Compilation & Installation
For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html).
-For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home).
+For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). Note: Although IGV can generate read depth plot from the BAM file given, it cannot recognize "ZW" tag RSEM puts. Therefore IGV counts each alignment as weight 1 instead of the expected weight for the plot it generates. So we recommend to use the wiggle file generated by RSEM for read depth visualization.
#### c) Generating Transcript Wiggle Plots
RSEM uses the [Boost C++](http://www.boost.org) and
[samtools](http://samtools.sourceforge.net) libraries.
+We thank earonesty for contributing patches.
+
## <a name="license"></a> License
RSEM is licensed under the [GNU General Public License v3](http://www.gnu.org/licenses/gpl-3.0.html).
--- /dev/null
+#!/usr/bin/perl
+
+use Getopt::Long;
+use Pod::Usage;
+use strict;
+
+my $standard_output = "&STDOUT";
+
+my $out_file = $standard_output;
+my @tmp_dirs = ();
+my $help = 0;
+
+GetOptions("o=s" => \$out_file,
+ "T|temporary-directory=s" => \@tmp_dirs,
+ "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2);
+
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+
+my $command;
+my (@fields, @header) = ();
+my $M;
+
+# Load fields
+my @lines = ();
+my $type;
+
+open(INPUT, "$ARGV[0].ti");
+@lines = <INPUT>;
+chomp(@lines);
+close(INPUT);
+
+@fields = ();
+($M, $type) = split(/ /, $lines[0]);
+for (my $i = 0; $i < $M; $i++) {
+ push(@fields, "SN:$lines[$i * 6 + 1]");
+}
+
+
+# Reorder header
+my $line;
+
+open(INPUT, $ARGV[1]);
+@header = ();
+while (($line = <INPUT>) && substr($line, 0, 1) eq '@') {
+ chomp($line);
+ push(@header, $line);
+}
+close(INPUT);
+
+my $n = scalar(@header);
+if ($n > 0) {
+ my %hash = ();
+ my @ktable = ();
+
+ my $tid = 0;
+
+ for (my $i = 0; $i < $n; $i++) {
+ my @arr = split(/\t/, $header[$i]);
+ if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; }
+ my $hasSN = 0;
+ foreach my $key (@arr) {
+ if (substr($key, 0, 3) eq "SN:") {
+ $hash{$key} = $i;
+ $hasSN = 1;
+ last;
+ }
+ }
+ if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); }
+ push(@ktable, $fields[$tid++]);
+ }
+
+ if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); }
+
+ open(OUTPUT, ">$out_file");
+ for (my $i = 0; $i < $n; $i++) {
+ if ($ktable[$i] eq "") { print OUTPUT $header[$i]; }
+ else { print OUTPUT $header[$hash{$ktable[$i]}]; }
+ print OUTPUT "\n";
+ }
+ close(OUTPUT);
+}
+
+
+# extract alignment section
+$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp";
+&runCommand($command);
+
+# sort and output the alignment section
+$command = "sort -k 1,1 -s";
+if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
+$command .= " $ARGV[1].__temp";
+if ($out_file ne $standard_output) { $command .= " >> $out_file"; }
+&runCommand($command);
+
+# delete temporary files
+$command = "rm -f $ARGV[1].__temp";
+&runCommand($command);
+
+# finish
+print STDERR "Conversion is completed successfully!\n";
+
+# command, {err_msg}
+sub runCommand {
+ print STDERR $_[0]."\n";
+ my $status = system($_[0]);
+ if ($status != 0) {
+ my $errmsg;
+ if (scalar(@_) > 1) { $errmsg = $_[1]; }
+ else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; }
+ print STDERR $errmsg."\n";
+ exit(-1);
+ }
+ print STDERR "\n";
+}
+
+__END__
+
+=head1 NAME
+
+convert-sam-for-rsem
+
+=head1 SYNOPSIS
+
+=over
+
+ convert-sam-for-rsem [options] reference_name input_sam
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<reference_name>
+
+The name of the reference used. This should be the same name used by 'rsem-prepare-reference'.
+
+=item B<input_sam>
+
+The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information).
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<-o> <file>
+
+Output the converted SAM file into <file>. (Default: STDOUT)
+
+=item B<-T/--temporary-directory> <directory>
+
+'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories".
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
+
+Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
+
+Note: This program can only recognize SAM files. See ARGUMENTS section.
+
+=head1 EXAMPLES
+
+Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'.
+
+1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
+
+ convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz
+
+2) Output to 'input_for_rsem.sam' directly:
+
+ convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
+
+=cut
+++ /dev/null
-#!/usr/bin/perl
-
-use Getopt::Long;
-use Pod::Usage;
-use strict;
-
-my $standard_output = "&STDOUT";
-
-my $out_file = $standard_output;
-my @tmp_dirs = ();
-my $help = 0;
-
-GetOptions("o=s" => \$out_file,
- "T|temporary-directory=s" => \@tmp_dirs,
- "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2);
-
-
-pod2usage(-verbose => 2) if ($help == 1);
-pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
-
-my $command;
-my (@fields, @header) = ();
-my $M;
-
-# Load fields
-my @lines = ();
-my $type;
-
-open(INPUT, "$ARGV[0].ti");
-@lines = <INPUT>;
-chomp(@lines);
-close(INPUT);
-
-@fields = ();
-($M, $type) = split(/ /, $lines[0]);
-for (my $i = 0; $i < $M; $i++) {
- push(@fields, "SN:$lines[$i * 6 + 1]");
-}
-
-
-# Reorder header
-my $line;
-
-open(INPUT, $ARGV[1]);
-@header = ();
-while (($line = <INPUT>) && substr($line, 0, 1) eq '@') {
- chomp($line);
- push(@header, $line);
-}
-close(INPUT);
-
-my $n = scalar(@header);
-if ($n > 0) {
- my %hash = ();
- my @ktable = ();
-
- my $tid = 0;
-
- for (my $i = 0; $i < $n; $i++) {
- my @arr = split(/\t/, $header[$i]);
- if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; }
- my $hasSN = 0;
- foreach my $key (@arr) {
- if (substr($key, 0, 3) eq "SN:") {
- $hash{$key} = $i;
- $hasSN = 1;
- last;
- }
- }
- if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); }
- push(@ktable, $fields[$tid++]);
- }
-
- if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); }
-
- open(OUTPUT, ">$out_file");
- for (my $i = 0; $i < $n; $i++) {
- if ($ktable[$i] eq "") { print OUTPUT $header[$i]; }
- else { print OUTPUT $header[$hash{$ktable[$i]}]; }
- print OUTPUT "\n";
- }
- close(OUTPUT);
-}
-
-
-# extract alignment section
-$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp";
-&runCommand($command);
-
-# sort and output the alignment section
-$command = "sort -k 1,1 -s";
-if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
-$command .= " $ARGV[1].__temp";
-if ($out_file ne $standard_output) { $command .= " >> $out_file"; }
-&runCommand($command);
-
-# delete temporary files
-$command = "rm -f $ARGV[1].__temp";
-&runCommand($command);
-
-# finish
-print STDERR "Conversion is completed successfully!\n";
-
-# command, {err_msg}
-sub runCommand {
- print STDERR $_[0]."\n";
- my $status = system($_[0]);
- if ($status != 0) {
- my $errmsg;
- if (scalar(@_) > 1) { $errmsg = $_[1]; }
- else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; }
- print STDERR $errmsg."\n";
- exit(-1);
- }
- print STDERR "\n";
-}
-
-__END__
-
-=head1 NAME
-
-convert_sam_for_rsem
-
-=head1 SYNOPSIS
-
-=over
-
- convert_sam_for_rsem [options] reference_name input_sam
-
-=back
-
-=head1 ARGUMENTS
-
-=over
-
-=item B<reference_name>
-
-The name of the reference used. This should be the same name used by 'rsem-prepare-reference'.
-
-=item B<input_sam>
-
-The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information).
-
-=back
-
-=head1 OPTIONS
-
-=over
-
-=item B<-o> <file>
-
-Output the converted SAM file into <file>. (Default: STDOUT)
-
-=item B<-T/--temporary-directory> <directory>
-
-'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories".
-
-=item B<-h/--help>
-
-Show help information.
-
-=back
-
-=head1 DESCRIPTION
-
-This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
-
-Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
-
-Note: This program can only recognize SAM files. See ARGUMENTS section.
-
-=head1 EXAMPLES
-
-Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'.
-
-1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
-
- convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz
-
-2) Output to 'input_for_rsem.sam' directly:
-
- convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
-
-=cut
--- /dev/null
+In this version of RSEM, several things are updated:
+
+- Added --chunkmbs option to rsem-calculate-expression (patch contributed by earonesty)
+- Added --sampling-for-bam option to rsem-calculate-expression, in the bam file, instead of providing expected weights, for each read RSEM samples one alignment based on the expected weights
+- RSEM can generate BAM and Wiggle files in both genomic-coordinate and transcript-coordinate
+- Added rsem-plot-transcript-wiggles. This script can generate transcript-coordinate wiggle plots in pdf format. One unique feature is, a stacked plot can be generated, with unique read contribution shown as black and multi-read contribution shown as red
+- Added convert_sam_for_rsem script for users do not use bowtie aligner
+- Modified RSEM's GTF file parser. Now RSEM does not require "transcript_id" and "gene_id" be the first two attributes shown
+- Improved descriptions for thread related errors.
+