X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=rsem-calculate-expression;h=4fbb4a59beac6a449a90e4dc6335d7a9633f6dcc;hb=52f1bd6f44f9b2630b839f192fb9ece18581983b;hp=9e6ef2e99e1d72e7db053401b764d62f6e7bd5f8;hpb=6eec553ba4ab1cb1c6f48cd14d9e5c92c7d85798;p=rsem.git diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 9e6ef2e..4fbb4a5 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -3,9 +3,10 @@ use Getopt::Long; use Pod::Usage; use File::Basename; -use Switch; use strict; +use rsem_perl_utils qw(runCommand collectResults showVersionInfo); + #const my $BURNIN = 200; my $NCV = 1000; @@ -59,6 +60,8 @@ my $keep_intermediate_files = 0; my $strand_specific = 0; +my $version = 0; + my $mTime = 0; my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0); @@ -102,11 +105,14 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, "time" => \$mTime, + "version" => \$version, "q|quiet" => \$quiet, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); -pod2usage(-verbose => 2) if ($help == 1); +my ($fn, $dir, $suf) = fileparse($0); +pod2usage(-verbose => 2) if ($help == 1); +&showVersionInfo($dir) if ($version == 1); #check parameters and options @@ -183,7 +189,6 @@ my ($mate_minL, $mate_maxL) = (1, $maxL); if ($bowtie_path ne "") { $bowtie_path .= "/"; } -my ($fn, $dir, $suf) = fileparse($0); my $command = ""; if (!$is_sam && !$is_bam) { @@ -242,12 +247,11 @@ if ($quiet) { $command .= " -q"; } &runCommand($command); $command = $dir."rsem-build-read-index $gap"; -switch($read_type) { - case 0 { $command .= " 0 $quiet $imdName\_alignable.fa"; } - case 1 { $command .= " 1 $quiet $imdName\_alignable.fq"; } - case 2 { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } - case 3 { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } -} +if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; } +elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; } +elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } +elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } +else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); } &runCommand($command); my $doesOpen = open(OUTPUT, ">$imdName.mparams"); @@ -273,8 +277,8 @@ if ($quiet) { $command .= " -q"; } &runCommand($command); -&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level -&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level +&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level +&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level if ($genBamF) { $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted"; @@ -307,8 +311,8 @@ if ($calcCI || $var_opt) { if ($calcCI) { system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); system("mv $sampleName.genes.results $imdName.genes.results.bak1"); - &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB"; $command .= " -p $nThreads"; @@ -317,8 +321,8 @@ if ($calcCI) { system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); system("mv $sampleName.genes.results $imdName.genes.results.bak2"); - &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level } if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } @@ -341,63 +345,6 @@ if ($mTime) { close(OUTPUT); } -# command, {err_msg} -sub runCommand { - print $_[0]."\n"; - my $status = system($_[0]); - if ($status != 0) { - my $errmsg; - if (scalar(@_) > 1) { $errmsg = $_[1]; } - else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; } - print $errmsg."\n"; - exit(-1); - } - print "\n"; -} - -# inpF, outF -sub collectResults { - my $local_status; - my ($inpF, $outF); - my (@results, @ids) = (); - my $line; - my $cnt; - - $inpF = $_[0]; - $outF = $_[1]; - - $local_status = open(INPUT, $inpF); - if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); } - - $cnt = 0; - @results = (); - - while ($line = ) { - ++$cnt; - chomp($line); - my @local_arr = split(/\t/, $line); - if ($cnt == 4) { @ids = @local_arr; } - else { push(@results, \@local_arr); } - } - - push(@results, \@ids); - close(INPUT); - - $local_status = open(OUTPUT, ">$outF"); - if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); } - - my $n = scalar(@results); - my $m = scalar(@{$results[0]}); - for (my $i = 0; $i < $m; $i++) { - my @out_arr = (); - for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); } - $" = "\t"; - print OUTPUT "@out_arr\n"; - } - close(OUTPUT); -} - - __END__ =head1 NAME @@ -608,33 +555,85 @@ With the '--calc-ci' option, 95% credibility intervals and posterior mean estima =over -=item B +=item B -File containing gene level expression estimates. The format of each -line in this file is: +File containing isoform level expression estimates. The first line +contains column names separated by the tab character. The format of +each line in the rest of this file is: -gene_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list +transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] Fields are separated by the tab character. Fields within "[]" are only -presented if '--calc-ci' is set. pme stands for posterior mean -estimation. pmc stands for posterior mean counts. ci_lower_bound(l) -means the lower bound of the credibility intervals, ci_upper_bound(u) -means the upper bound of the credibility intervals. So the credibility -interval is [l, u]. 'transcript_id_list' is a space-separated list of -transcript_ids belonging to the gene. If no gene information is -provided, this file has the same content as -'sample_name.isoforms.results'. +presented if '--calc-ci' is set. -=item B +'transcript_id' is the transcript name of this transcript. 'gene_id' +is the gene name of the gene which this transcript belongs to (denote +this gene as its parent gene). If no gene information is provided, +'gene_id' and 'transcript_id' are the same. + +'length' is this transcript's sequence length (poly(A) tail is not +counted). 'effective_length' counts only the positions that can +generate a valid fragment. If no poly(A) tail is added, +'effective_length' is equal to transcript length - mean fragment +length + 1. If one transcript's effective length is less than 1, this +transcript's both effective length and abundance estimates are set to +0. + +'expected_count' is the sum of the posterior probability of each read +comes from this transcript over all reads. Because 1) each read +aligning to this transcript has a probability of being generated from +background noise; 2) RSEM may filter some alignable low quality reads, +the sum of expected counts for all transcript are generally less than +the total number of reads aligned. + +'TPM' stands for Transcripts Per Million. It is a relative measure of +transcript abundance. The sum of all transcripts' TPM is 1 +million. 'FPKM' stands for Fragments Per Kilobase of transcript per +Million mapped reads. It is another relative measure of transcript +abundance. If we define l_bar be the mean transcript length in a +sample, which can be calculated as + +l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript), -File containing isoform level expression values. The format of each -line in this file is: +the following equation is hold: + +FPKM_i = 10^3 / l_bar * TPM_i. + +We can see that the sum of FPKM is not a constant across samples. + +'IsoPct' stands for isoform percentage. It is the percentage of this +transcript's abandunce over its parent gene's abandunce. If its parent +gene has only one isoform or the gene information is not provided, +this field will be set to 100. + +'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean +estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is +the isoform percentage calculated from 'pme_TPM' values. + +'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and +'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% +credibility intervals for TPM and FPKM values. The bounds are +inclusive (i.e. [l, u]). + +=item B + +File containing gene level expression estimates. The first line +contains column names separated by the tab character. The format of +each line in the rest of this file is: + +gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] + +Fields are separated by the tab character. Fields within "[]" are only +presented if '--calc-ci' is set. -transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id +'transcript_id(s)' is a comma-separated list of transcript_ids +belonging to this gene. If no gene information is provided, 'gene_id' +and 'transcript_id(s)' are identical (the 'transcript_id'). -Fields are separated by the tab character. 'gene_id' is the gene_id of -the gene which this transcript belongs to. If no gene information is -provided, 'gene_id' and 'transcript_id' are the same. +A gene's 'length' and 'effective_length' are +defined as the weighted average of its transcripts' lengths and +effective lengths (weighted by 'IsoPct'). A gene's abundance estimates +are just the sum of its transcripts' abundance estimates. =item B