use Getopt::Long;
use Pod::Usage;
use File::Basename;
-use Switch;
use strict;
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
+
#const
my $BURNIN = 200;
my $NCV = 1000;
my $strand_specific = 0;
+my $version = 0;
+
my $mTime = 0;
my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
"calc-ci" => \$calcCI,
"ci-memory=i" => \$NMB,
"time" => \$mTime,
+ "version" => \$version,
"q|quiet" => \$quiet,
"h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
-pod2usage(-verbose => 2) if ($help == 1);
+my ($fn, $dir, $suf) = fileparse($0);
+pod2usage(-verbose => 2) if ($help == 1);
+&showVersionInfo($dir) if ($version == 1);
#check parameters and options
if ($bowtie_path ne "") { $bowtie_path .= "/"; }
-my ($fn, $dir, $suf) = fileparse($0);
my $command = "";
if (!$is_sam && !$is_bam) {
&runCommand($command);
$command = $dir."rsem-build-read-index $gap";
-switch($read_type) {
- case 0 { $command .= " 0 $quiet $imdName\_alignable.fa"; }
- case 1 { $command .= " 1 $quiet $imdName\_alignable.fq"; }
- case 2 { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
- case 3 { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
-}
+if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; }
+elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; }
+elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
+elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
+else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); }
&runCommand($command);
my $doesOpen = open(OUTPUT, ">$imdName.mparams");
&runCommand($command);
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
if ($genBamF) {
$command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
if ($calcCI) {
system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
system("mv $sampleName.genes.results $imdName.genes.results.bak1");
- &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
- &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
$command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
$command .= " -p $nThreads";
system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
system("mv $sampleName.genes.results $imdName.genes.results.bak2");
- &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
- &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
}
if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
close(OUTPUT);
}
-# command, {err_msg}
-sub runCommand {
- print $_[0]."\n";
- my $status = system($_[0]);
- if ($status != 0) {
- my $errmsg;
- if (scalar(@_) > 1) { $errmsg = $_[1]; }
- else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
- print $errmsg."\n";
- exit(-1);
- }
- print "\n";
-}
-
-# inpF, outF
-sub collectResults {
- my $local_status;
- my ($inpF, $outF);
- my (@results, @ids) = ();
- my $line;
- my $cnt;
-
- $inpF = $_[0];
- $outF = $_[1];
-
- $local_status = open(INPUT, $inpF);
- if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); }
-
- $cnt = 0;
- @results = ();
-
- while ($line = <INPUT>) {
- ++$cnt;
- chomp($line);
- my @local_arr = split(/\t/, $line);
- if ($cnt == 4) { @ids = @local_arr; }
- else { push(@results, \@local_arr); }
- }
-
- push(@results, \@ids);
- close(INPUT);
-
- $local_status = open(OUTPUT, ">$outF");
- if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); }
-
- my $n = scalar(@results);
- my $m = scalar(@{$results[0]});
- for (my $i = 0; $i < $m; $i++) {
- my @out_arr = ();
- for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); }
- $" = "\t";
- print OUTPUT "@out_arr\n";
- }
- close(OUTPUT);
-}
-
-
__END__
=head1 NAME
=over
-=item B<sample_name.genes.results>
+=item B<sample_name.isoforms.results>
-File containing gene level expression estimates. The format of each
-line in this file is:
+File containing isoform level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
-gene_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list
+transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set. pme stands for posterior mean
-estimation. pmc stands for posterior mean counts. ci_lower_bound(l)
-means the lower bound of the credibility intervals, ci_upper_bound(u)
-means the upper bound of the credibility intervals. So the credibility
-interval is [l, u]. 'transcript_id_list' is a space-separated list of
-transcript_ids belonging to the gene. If no gene information is
-provided, this file has the same content as
-'sample_name.isoforms.results'.
+presented if '--calc-ci' is set.
-=item B<sample_name.isoforms.results>
+'transcript_id' is the transcript name of this transcript. 'gene_id'
+is the gene name of the gene which this transcript belongs to (denote
+this gene as its parent gene). If no gene information is provided,
+'gene_id' and 'transcript_id' are the same.
+
+'length' is this transcript's sequence length (poly(A) tail is not
+counted). 'effective_length' counts only the positions that can
+generate a valid fragment. If no poly(A) tail is added,
+'effective_length' is equal to transcript length - mean fragment
+length + 1. If one transcript's effective length is less than 1, this
+transcript's both effective length and abundance estimates are set to
+0.
+
+'expected_count' is the sum of the posterior probability of each read
+comes from this transcript over all reads. Because 1) each read
+aligning to this transcript has a probability of being generated from
+background noise; 2) RSEM may filter some alignable low quality reads,
+the sum of expected counts for all transcript are generally less than
+the total number of reads aligned.
+
+'TPM' stands for Transcripts Per Million. It is a relative measure of
+transcript abundance. The sum of all transcripts' TPM is 1
+million. 'FPKM' stands for Fragments Per Kilobase of transcript per
+Million mapped reads. It is another relative measure of transcript
+abundance. If we define l_bar be the mean transcript length in a
+sample, which can be calculated as
+
+l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript),
-File containing isoform level expression values. The format of each
-line in this file is:
+the following equation is hold:
+
+FPKM_i = 10^3 / l_bar * TPM_i.
+
+We can see that the sum of FPKM is not a constant across samples.
+
+'IsoPct' stands for isoform percentage. It is the percentage of this
+transcript's abandunce over its parent gene's abandunce. If its parent
+gene has only one isoform or the gene information is not provided,
+this field will be set to 100.
+
+'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean
+estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is
+the isoform percentage calculated from 'pme_TPM' values.
+
+'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
+'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
+credibility intervals for TPM and FPKM values. The bounds are
+inclusive (i.e. [l, u]).
+
+=item B<sample_name.genes.results>
+
+File containing gene level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+
+Fields are separated by the tab character. Fields within "[]" are only
+presented if '--calc-ci' is set.
-transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id
+'transcript_id(s)' is a comma-separated list of transcript_ids
+belonging to this gene. If no gene information is provided, 'gene_id'
+and 'transcript_id(s)' are identical (the 'transcript_id').
-Fields are separated by the tab character. 'gene_id' is the gene_id of
-the gene which this transcript belongs to. If no gene information is
-provided, 'gene_id' and 'transcript_id' are the same.
+A gene's 'length' and 'effective_length' are
+defined as the weighted average of its transcripts' lengths and
+effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
+are just the sum of its transcripts' abundance estimates.
=item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>