From: Bo Li Date: Wed, 18 Jan 2012 05:36:57 +0000 (-0600) Subject: rsem v1.1.14 X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=229ba9d68e0a801907631887640ab475d51c560c rsem v1.1.14 --- diff --git a/GTFItem.h b/GTFItem.h index 4e50270..a908375 100644 --- a/GTFItem.h +++ b/GTFItem.h @@ -10,114 +10,114 @@ #include "utils.h" class GTFItem { - public: - - GTFItem() { - seqname = source = feature = ""; - score = ""; - start = end = 0; - strand = 0; //strand is a char variable - frame = ""; - gene_id = transcript_id = ""; - left = ""; - } - - bool operator<(const GTFItem& o) const { - if (gene_id != o.gene_id) return gene_id < o.gene_id; - if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id; - return start < o.start; - } - - void my_assert(char value, std::string& line, const std::string& msg) { - if (!value) { - fprintf(stderr, ".gtf file might be corrupted!\n"); - fprintf(stderr, "Stop at line : %s\n", line.c_str()); - fprintf(stderr, "Error Message: %s\n", msg.c_str()); - exit(-1); - } - } - - void parse(std::string line) { - std::istringstream strin(line); - std::string tmp; - - getline(strin, seqname, '\t'); - getline(strin, source, '\t'); - getline(strin, feature, '\t'); - getline(strin, tmp, '\t'); - start = atoi(tmp.c_str()); - getline(strin, tmp, '\t'); - end = atoi(tmp.c_str()); - getline(strin, score, '\t'); - getline(strin, tmp, '\t'); - my_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!"); - strand = tmp[0]; - getline(strin, frame, '\t'); - - getline(strin, left); // assign attributes and possible comments into "left" - - strin.clear(); strin.str(left); - bool find_gene_id = false, find_transcript_id = false; - - while (getline(strin, tmp, ';') && (!find_gene_id || !find_transcript_id)) { - tmp = cleanStr(tmp); - size_t pos = tmp.find(' '); - my_assert((pos != std::string::npos), line, "Cannot separate the identifier from the value for attribute " + tmp + "!"); - std::string identifier = tmp.substr(0, pos); - - if (identifier == "gene_id") { - my_assert(!find_gene_id, line, "gene_id appear more than once!"); - tmp = cleanStr(tmp.substr(pos)); - my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!"); - gene_id = tmp.substr(1, tmp.length() - 2); - find_gene_id = true; - } else if (identifier == "transcript_id") { - my_assert(!find_transcript_id, line, "transcript_id appear more than once!"); - tmp = cleanStr(tmp.substr(pos)); - my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!"); - transcript_id = tmp.substr(1, tmp.length() - 2); - find_transcript_id = true; - } - } - - my_assert(find_gene_id, line, "Cannot find gene_id!"); - my_assert(find_transcript_id, line, "Cannot find transcript_id!"); - } - - std::string getSeqName() { return seqname; } - std::string getSource() { return source; } - std::string getFeature() { return feature; } - int getStart() { return start; } - int getEnd() { return end; } - char getStrand() { return strand; } - std::string getScore() { return score; } // float, integer or "." ; let downstream programs parse it - std::string getFrame() { return frame; } // 0, 1, 2, or "."; let downstream programs parse it - std::string getGeneID() { return gene_id; } - std::string getTranscriptID() { return transcript_id; } - std::string getLeft() { return left; } - - void setGeneID(const std::string& gene_id) { - this->gene_id = gene_id; - } - - std::string toString() { - std::string val; - std::ostringstream strout; - strout<gene_id = gene_id; + } + + std::string toString() { + std::string val; + std::ostringstream strout; + strout< Introduction RSEM is a software package for estimating gene and isoform expression -levels from RNA-Seq data. The new RSEM package (rsem-1.x) provides an -user-friendly interface, supports threads for parallel computation of -the EM algorithm, single-end and paired-end read data, quality scores, -variable-length reads and RSPD estimation. It can also generate -genomic-coordinate BAM files and UCSC wiggle files for -visualization. In addition, it provides posterior mean and 95% -credibility interval estimates for expression levels. For -visualization, it can also generate transcript-coordinate BAM files -and visualize them and also models learned. +levels from RNA-Seq data. The RSEM package provides an user-friendly +interface, supports threads for parallel computation of the EM +algorithm, single-end and paired-end read data, quality scores, +variable-length reads and RSPD estimation. In addition, it provides +posterior mean and 95% credibility interval estimates for expression +levels. For visualization, It can generate BAM and Wiggle files in +both transcript-coordinate and genomic-coordinate. Genomic-coordinate +files can be visualized by both UCSC Genome browser and Broad +Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate +files can be visualized by IGV. RSEM also has its own scripts to +generate transcript read depth plots in pdf format. The unique feature +of RSEM is, the read depth plots can be stacked, with read depth +contributed to unique reads shown in black and contributed to +multi-reads shown in red. In addition, models learned from data can +also be visualized. Last but not least, RSEM contains a simulator. ## Compilation & Installation @@ -152,7 +158,7 @@ wiggle_name: the name the user wants to use for this wiggle plot For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html). -For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). +For integrative genomics viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). Note: Although IGV can generate read depth plot from the BAM file given, it cannot recognize "ZW" tag RSEM puts. Therefore IGV counts each alignment as weight 1 instead of the expected weight for the plot it generates. So we recommend to use the wiggle file generated by RSEM for read depth visualization. #### c) Generating Transcript Wiggle Plots @@ -258,6 +264,8 @@ map_file: transcript-to-gene-map file's name. RSEM uses the [Boost C++](http://www.boost.org) and [samtools](http://samtools.sourceforge.net) libraries. +We thank earonesty for contributing patches. + ## License RSEM is licensed under the [GNU General Public License v3](http://www.gnu.org/licenses/gpl-3.0.html). diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem new file mode 100755 index 0000000..68a42d7 --- /dev/null +++ b/convert-sam-for-rsem @@ -0,0 +1,184 @@ +#!/usr/bin/perl + +use Getopt::Long; +use Pod::Usage; +use strict; + +my $standard_output = "&STDOUT"; + +my $out_file = $standard_output; +my @tmp_dirs = (); +my $help = 0; + +GetOptions("o=s" => \$out_file, + "T|temporary-directory=s" => \@tmp_dirs, + "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); + + +pod2usage(-verbose => 2) if ($help == 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); + +my $command; +my (@fields, @header) = (); +my $M; + +# Load fields +my @lines = (); +my $type; + +open(INPUT, "$ARGV[0].ti"); +@lines = ; +chomp(@lines); +close(INPUT); + +@fields = (); +($M, $type) = split(/ /, $lines[0]); +for (my $i = 0; $i < $M; $i++) { + push(@fields, "SN:$lines[$i * 6 + 1]"); +} + + +# Reorder header +my $line; + +open(INPUT, $ARGV[1]); +@header = (); +while (($line = ) && substr($line, 0, 1) eq '@') { + chomp($line); + push(@header, $line); +} +close(INPUT); + +my $n = scalar(@header); +if ($n > 0) { + my %hash = (); + my @ktable = (); + + my $tid = 0; + + for (my $i = 0; $i < $n; $i++) { + my @arr = split(/\t/, $header[$i]); + if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } + my $hasSN = 0; + foreach my $key (@arr) { + if (substr($key, 0, 3) eq "SN:") { + $hash{$key} = $i; + $hasSN = 1; + last; + } + } + if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } + push(@ktable, $fields[$tid++]); + } + + if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } + + open(OUTPUT, ">$out_file"); + for (my $i = 0; $i < $n; $i++) { + if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } + else { print OUTPUT $header[$hash{$ktable[$i]}]; } + print OUTPUT "\n"; + } + close(OUTPUT); +} + + +# extract alignment section +$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; +&runCommand($command); + +# sort and output the alignment section +$command = "sort -k 1,1 -s"; +if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } +$command .= " $ARGV[1].__temp"; +if ($out_file ne $standard_output) { $command .= " >> $out_file"; } +&runCommand($command); + +# delete temporary files +$command = "rm -f $ARGV[1].__temp"; +&runCommand($command); + +# finish +print STDERR "Conversion is completed successfully!\n"; + +# command, {err_msg} +sub runCommand { + print STDERR $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; } + print STDERR $errmsg."\n"; + exit(-1); + } + print STDERR "\n"; +} + +__END__ + +=head1 NAME + +convert-sam-for-rsem + +=head1 SYNOPSIS + +=over + + convert-sam-for-rsem [options] reference_name input_sam + +=back + +=head1 ARGUMENTS + +=over + +=item B + +The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. + +=item B + +The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). + +=back + +=head1 OPTIONS + +=over + +=item B<-o> + +Output the converted SAM file into . (Default: STDOUT) + +=item B<-T/--temporary-directory> + +'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". + +=item B<-h/--help> + +Show help information. + +=back + +=head1 DESCRIPTION + +This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. + +Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. + +Note: This program can only recognize SAM files. See ARGUMENTS section. + +=head1 EXAMPLES + +Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. + +1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': + + convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz + +2) Output to 'input_for_rsem.sam' directly: + + convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + +=cut diff --git a/convert_sam_for_rsem b/convert_sam_for_rsem deleted file mode 100755 index a6bc124..0000000 --- a/convert_sam_for_rsem +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/perl - -use Getopt::Long; -use Pod::Usage; -use strict; - -my $standard_output = "&STDOUT"; - -my $out_file = $standard_output; -my @tmp_dirs = (); -my $help = 0; - -GetOptions("o=s" => \$out_file, - "T|temporary-directory=s" => \@tmp_dirs, - "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); - - -pod2usage(-verbose => 2) if ($help == 1); -pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); - -my $command; -my (@fields, @header) = (); -my $M; - -# Load fields -my @lines = (); -my $type; - -open(INPUT, "$ARGV[0].ti"); -@lines = ; -chomp(@lines); -close(INPUT); - -@fields = (); -($M, $type) = split(/ /, $lines[0]); -for (my $i = 0; $i < $M; $i++) { - push(@fields, "SN:$lines[$i * 6 + 1]"); -} - - -# Reorder header -my $line; - -open(INPUT, $ARGV[1]); -@header = (); -while (($line = ) && substr($line, 0, 1) eq '@') { - chomp($line); - push(@header, $line); -} -close(INPUT); - -my $n = scalar(@header); -if ($n > 0) { - my %hash = (); - my @ktable = (); - - my $tid = 0; - - for (my $i = 0; $i < $n; $i++) { - my @arr = split(/\t/, $header[$i]); - if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } - my $hasSN = 0; - foreach my $key (@arr) { - if (substr($key, 0, 3) eq "SN:") { - $hash{$key} = $i; - $hasSN = 1; - last; - } - } - if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } - push(@ktable, $fields[$tid++]); - } - - if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } - - open(OUTPUT, ">$out_file"); - for (my $i = 0; $i < $n; $i++) { - if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } - else { print OUTPUT $header[$hash{$ktable[$i]}]; } - print OUTPUT "\n"; - } - close(OUTPUT); -} - - -# extract alignment section -$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; -&runCommand($command); - -# sort and output the alignment section -$command = "sort -k 1,1 -s"; -if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } -$command .= " $ARGV[1].__temp"; -if ($out_file ne $standard_output) { $command .= " >> $out_file"; } -&runCommand($command); - -# delete temporary files -$command = "rm -f $ARGV[1].__temp"; -&runCommand($command); - -# finish -print STDERR "Conversion is completed successfully!\n"; - -# command, {err_msg} -sub runCommand { - print STDERR $_[0]."\n"; - my $status = system($_[0]); - if ($status != 0) { - my $errmsg; - if (scalar(@_) > 1) { $errmsg = $_[1]; } - else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; } - print STDERR $errmsg."\n"; - exit(-1); - } - print STDERR "\n"; -} - -__END__ - -=head1 NAME - -convert_sam_for_rsem - -=head1 SYNOPSIS - -=over - - convert_sam_for_rsem [options] reference_name input_sam - -=back - -=head1 ARGUMENTS - -=over - -=item B - -The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. - -=item B - -The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). - -=back - -=head1 OPTIONS - -=over - -=item B<-o> - -Output the converted SAM file into . (Default: STDOUT) - -=item B<-T/--temporary-directory> - -'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". - -=item B<-h/--help> - -Show help information. - -=back - -=head1 DESCRIPTION - -This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. - -Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. - -Note: This program can only recognize SAM files. See ARGUMENTS section. - -=head1 EXAMPLES - -Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. - -1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': - - convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz - -2) Output to 'input_for_rsem.sam' directly: - - convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam - -=cut diff --git a/what_is_new b/what_is_new new file mode 100644 index 0000000..7168193 --- /dev/null +++ b/what_is_new @@ -0,0 +1,10 @@ +In this version of RSEM, several things are updated: + +- Added --chunkmbs option to rsem-calculate-expression (patch contributed by earonesty) +- Added --sampling-for-bam option to rsem-calculate-expression, in the bam file, instead of providing expected weights, for each read RSEM samples one alignment based on the expected weights +- RSEM can generate BAM and Wiggle files in both genomic-coordinate and transcript-coordinate +- Added rsem-plot-transcript-wiggles. This script can generate transcript-coordinate wiggle plots in pdf format. One unique feature is, a stacked plot can be generated, with unique read contribution shown as black and multi-read contribution shown as red +- Added convert_sam_for_rsem script for users do not use bowtie aligner +- Modified RSEM's GTF file parser. Now RSEM does not require "transcript_id" and "gene_id" be the first two attributes shown +- Improved descriptions for thread related errors. +