X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=rsem-calculate-expression;h=62339cb4aa41b1e43a5da611a8bb110f018cf399;hb=80182d3120cde6e3831fc3ea99af91f20dade08b;hp=684e33c2c3eb713536e3f1db6a82d44acda0bbe2;hpb=683863b75f8d8bef2461039a6911b0e9619cc113;p=rsem.git diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 684e33c..62339cb 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -2,10 +2,12 @@ use Getopt::Long; use Pod::Usage; -use File::Basename; -use Switch; +use FindBin; +use lib $FindBin::Bin; use strict; +use rsem_perl_utils qw(runCommand collectResults showVersionInfo); + #const my $BURNIN = 200; my $NCV = 1000; @@ -59,6 +61,8 @@ my $keep_intermediate_files = 0; my $strand_specific = 0; +my $version = 0; + my $mTime = 0; my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0); @@ -84,9 +88,9 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "bowtie-e=i" => \$E, "bowtie-m=i" => \$maxHits, "bowtie-chunkmbs=i" => \$chunkMbs, - "phred33-quals" => \$phred33, - "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64, - "solexa-quals" => \$solexa, + "bowtie-phred33-quals" => \$phred33, + "bowtie-phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64, + "bowtie-solexa-quals" => \$solexa, "forward-prob=f" => \$probF, "fragment-length-min=i" => \$minL, "fragment-length-max=i" => \$maxL, @@ -102,11 +106,14 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, "time" => \$mTime, + "version" => \$version, "q|quiet" => \$quiet, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); -pod2usage(-verbose => 2) if ($help == 1); +my $dir = "$FindBin::Bin/"; +pod2usage(-verbose => 2) if ($help == 1); +&showVersionInfo($dir) if ($version == 1); #check parameters and options @@ -183,7 +190,6 @@ my ($mate_minL, $mate_maxL) = (1, $maxL); if ($bowtie_path ne "") { $bowtie_path .= "/"; } -my ($fn, $dir, $suf) = fileparse($0); my $command = ""; if (!$is_sam && !$is_bam) { @@ -242,12 +248,11 @@ if ($quiet) { $command .= " -q"; } &runCommand($command); $command = $dir."rsem-build-read-index $gap"; -switch($read_type) { - case 0 { $command .= " 0 $quiet $imdName\_alignable.fa"; } - case 1 { $command .= " 1 $quiet $imdName\_alignable.fq"; } - case 2 { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } - case 3 { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } -} +if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; } +elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; } +elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } +elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } +else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); } &runCommand($command); my $doesOpen = open(OUTPUT, ">$imdName.mparams"); @@ -273,8 +278,8 @@ if ($quiet) { $command .= " -q"; } &runCommand($command); -&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level -&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level +&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level +&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level if ($genBamF) { $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted"; @@ -307,8 +312,8 @@ if ($calcCI || $var_opt) { if ($calcCI) { system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); system("mv $sampleName.genes.results $imdName.genes.results.bak1"); - &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB"; $command .= " -p $nThreads"; @@ -317,8 +322,8 @@ if ($calcCI) { system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); system("mv $sampleName.genes.results $imdName.genes.results.bak2"); - &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level } if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } @@ -341,63 +346,6 @@ if ($mTime) { close(OUTPUT); } -# command, {err_msg} -sub runCommand { - print $_[0]."\n"; - my $status = system($_[0]); - if ($status != 0) { - my $errmsg = ""; - if (scalar(@_) > 1) { $errmsg .= $_[1]."\n"; } - $errmsg .= "\"$_[0]\" failed! Plase check if you provide correct parameters/options for the pipeline!\n"; - print $errmsg; - exit(-1); - } - print "\n"; -} - -# inpF, outF -sub collectResults { - my $local_status; - my ($inpF, $outF); - my (@results, @ids) = (); - my $line; - my $cnt; - - $inpF = $_[0]; - $outF = $_[1]; - - $local_status = open(INPUT, $inpF); - if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); } - - $cnt = 0; - @results = (); - - while ($line = ) { - ++$cnt; - chomp($line); - my @local_arr = split(/\t/, $line); - if ($cnt == 4) { @ids = @local_arr; } - else { push(@results, \@local_arr); } - } - - push(@results, \@ids); - close(INPUT); - - $local_status = open(OUTPUT, ">$outF"); - if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); } - - my $n = scalar(@results); - my $m = scalar(@{$results[0]}); - for (my $i = 0; $i < $m; $i++) { - my @out_arr = (); - for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); } - $" = "\t"; - print OUTPUT "@out_arr\n"; - } - close(OUTPUT); -} - - __END__ =head1 NAME @@ -406,14 +354,10 @@ rsem-calculate-expression =head1 SYNOPSIS -=over - - rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name - rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name rsem-calculate-expression [options] --sam/--bam [--paired-end] input reference_name sample_name -=back - =head1 ARGUMENTS =over @@ -516,17 +460,17 @@ The path to the bowtie executables. (Default: the path to the bowtie executables (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default) -=item B<--phred33-quals> +=item B<--bowtie-phred33-quals> -Input quality scores are encoded as Phred+33. (Default: on) +(Bowtie parameter) Input quality scores are encoded as Phred+33. (Default: on) -=item B<--phred64-quals> +=item B<--bowtie-phred64-quals> -Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off) +(Bowtie parameter) Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off) -=item B<--solexa-quals> +=item B<--bowtie-solexa-quals> -Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off) +(Bowtie parameter) Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off) =item B<--forward-prob> @@ -608,33 +552,85 @@ With the '--calc-ci' option, 95% credibility intervals and posterior mean estima =over -=item B +=item B -File containing gene level expression estimates. The format of each -line in this file is: +File containing isoform level expression estimates. The first line +contains column names separated by the tab character. The format of +each line in the rest of this file is: -gene_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list +transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] Fields are separated by the tab character. Fields within "[]" are only -presented if '--calc-ci' is set. pme stands for posterior mean -estimation. pmc stands for posterior mean counts. ci_lower_bound(l) -means the lower bound of the credibility intervals, ci_upper_bound(u) -means the upper bound of the credibility intervals. So the credibility -interval is [l, u]. 'transcript_id_list' is a space-separated list of -transcript_ids belonging to the gene. If no gene information is -provided, this file has the same content as -'sample_name.isoforms.results'. +presented if '--calc-ci' is set. -=item B +'transcript_id' is the transcript name of this transcript. 'gene_id' +is the gene name of the gene which this transcript belongs to (denote +this gene as its parent gene). If no gene information is provided, +'gene_id' and 'transcript_id' are the same. + +'length' is this transcript's sequence length (poly(A) tail is not +counted). 'effective_length' counts only the positions that can +generate a valid fragment. If no poly(A) tail is added, +'effective_length' is equal to transcript length - mean fragment +length + 1. If one transcript's effective length is less than 1, this +transcript's both effective length and abundance estimates are set to +0. + +'expected_count' is the sum of the posterior probability of each read +comes from this transcript over all reads. Because 1) each read +aligning to this transcript has a probability of being generated from +background noise; 2) RSEM may filter some alignable low quality reads, +the sum of expected counts for all transcript are generally less than +the total number of reads aligned. + +'TPM' stands for Transcripts Per Million. It is a relative measure of +transcript abundance. The sum of all transcripts' TPM is 1 +million. 'FPKM' stands for Fragments Per Kilobase of transcript per +Million mapped reads. It is another relative measure of transcript +abundance. If we define l_bar be the mean transcript length in a +sample, which can be calculated as + +l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript), -File containing isoform level expression values. The format of each -line in this file is: +the following equation is hold: + +FPKM_i = 10^3 / l_bar * TPM_i. + +We can see that the sum of FPKM is not a constant across samples. + +'IsoPct' stands for isoform percentage. It is the percentage of this +transcript's abandunce over its parent gene's abandunce. If its parent +gene has only one isoform or the gene information is not provided, +this field will be set to 100. + +'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean +estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is +the isoform percentage calculated from 'pme_TPM' values. + +'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and +'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% +credibility intervals for TPM and FPKM values. The bounds are +inclusive (i.e. [l, u]). + +=item B + +File containing gene level expression estimates. The first line +contains column names separated by the tab character. The format of +each line in the rest of this file is: + +gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] + +Fields are separated by the tab character. Fields within "[]" are only +presented if '--calc-ci' is set. -transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id +'transcript_id(s)' is a comma-separated list of transcript_ids +belonging to this gene. If no gene information is provided, 'gene_id' +and 'transcript_id(s)' are identical (the 'transcript_id'). -Fields are separated by the tab character. 'gene_id' is the gene_id of -the gene which this transcript belongs to. If no gene information is -provided, 'gene_id' and 'transcript_id' are the same. +A gene's 'length' and 'effective_length' are +defined as the weighted average of its transcripts' lengths and +effective lengths (weighted by 'IsoPct'). A gene's abundance estimates +are just the sum of its transcripts' abundance estimates. =item B