X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=rsem-calculate-expression;h=249ca8baa06d6f7067ffd9f7548c13fb99a7ad51;hb=86e650e9577999a7ba00ab454d1f6bf674b0ea70;hp=0f22282dfc8f0b4ea259d057b7aca3ebdc2528ce;hpb=01ba1b045976306e502aaf784c4a1d0787f12ac5;p=rsem.git diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 0f22282..249ca8b 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -15,7 +15,7 @@ my $NSPC = 50; my $NMB = 1024; # default -my $status; +my $status = 0; my $read_type = 1; # default, single end with qual @@ -55,6 +55,9 @@ my $keep_intermediate_files = 0; my $strand_specific = 0; +my $mTime = 0; +my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0); + GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "no-qualities" => \$no_qual, "paired-end" => \$paired_end, @@ -82,6 +85,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "out-bam" => \$genBamF, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, + "time" => \$mTime, "q|quiet" => \$quiet, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); @@ -106,6 +110,7 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL); pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1); pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1); +pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25); if ($strand_specific) { $probF = 1.0; } @@ -113,7 +118,7 @@ my $mate1_list = ""; my $mate2_list = ""; my $inpF = ""; -my ($refName, $taskName, $tmp_dir, $imdName) = (); +my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = (); my $gap = 32; if ($paired_end) { @@ -129,30 +134,31 @@ if (scalar(@ARGV) == 3) { if ($is_sam || $is_bam) { $inpF = $ARGV[0]; } else {$mate1_list = $ARGV[0]; } $refName = $ARGV[1]; - $taskName = $ARGV[2]; + $sampleName = $ARGV[2]; } else { $mate1_list = $ARGV[0]; $mate2_list = $ARGV[1]; $refName = $ARGV[2]; - $taskName = $ARGV[3]; + $sampleName = $ARGV[3]; } -$tmp_dir = $taskName.".temp"; -my $pos = rindex($taskName, '/'); -if ($pos < 0) { - $imdName = "$tmp_dir/$taskName"; -} -else { - $imdName = $tmp_dir."/".substr($taskName, $pos + 1); -} +my $pos = rindex($sampleName, '/'); +if ($pos < 0) { $sampleToken = $sampleName; } +else { $sampleToken = substr($sampleName, $pos + 1); } + +$temp_dir = "$sampleName.temp"; +$stat_dir = "$sampleName.stat"; + +if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); } +if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); } + +$imdName = "$temp_dir/$sampleToken"; if (!$is_sam && !$is_bam && $phred33 + $phred64 + $solexa == 0) { $phred33 = 1; } my ($mate_minL, $mate_maxL) = (1, $maxL); -if (!(-d $tmp_dir) && !mkdir($tmp_dir)) { print "Fail to create the directory.\n"; exit(-1); } - if ($bowtie_path ne "") { $bowtie_path .= "/"; } my ($fn, $dir, $suf) = fileparse($0); @@ -173,7 +179,7 @@ if (!$is_sam && !$is_bam) { if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; } if ($strand_specific || $probF == 1.0) { $command .= " --norc"; } - elsif ($probF = 0.0) { $command .= " --nofw"; } + elsif ($probF == 0.0) { $command .= " --nofw"; } $command .= " -p $nThreads -a -m $maxHits -S"; if ($quiet) { $command .= " --quiet"; } @@ -188,7 +194,13 @@ if (!$is_sam && !$is_bam) { $command .= " | gzip > $imdName.sam.gz"; print "$command\n"; + + if ($mTime) { $time_start = time(); } + $status = system($command); + + if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; } + if ($status != 0) { print "bowtie failed! Please check if you provide correct parameters/options for the pipeline!\n"; exit(-1); @@ -199,7 +211,9 @@ if (!$is_sam && !$is_bam) { $is_sam = 1; # output of bowtie is a sam file } -$command = $dir."rsem-parse-alignments $refName $imdName"; +if ($mTime) { $time_start = time(); } + +$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken"; my $samInpType; if ($is_sam) { $samInpType = "s"; } @@ -233,8 +247,8 @@ if ($status != 0) { } print "\n"; -$status = open(OUTPUT, ">$imdName.mparams"); -if ($status == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); } +my $doesOpen = open(OUTPUT, ">$imdName.mparams"); +if ($doesOpen == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); } print OUTPUT "$minL $maxL\n"; print OUTPUT "$probF\n"; print OUTPUT "$estRSPD\n"; @@ -244,7 +258,7 @@ print OUTPUT "$mean $sd\n"; print OUTPUT "$L\n"; close(OUTPUT); -$command = $dir."rsem-run-em $refName $read_type $imdName $taskName -p $nThreads"; +$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads"; if ($genBamF) { $command .= " -b $samInpType $inpF"; if ($fn_list ne "") { $command .= " 1 $fn_list"; } @@ -262,7 +276,7 @@ if ($status != 0) { print "\n"; if ($genBamF) { - $command = $dir."sam/samtools sort $taskName.bam $taskName.sorted"; + $command = $dir."sam/samtools sort $sampleName.bam $sampleName.sorted"; print "$command\n"; $status = system($command); if ($status != 0) { @@ -270,7 +284,7 @@ if ($genBamF) { exit(-1); } print "\n"; - $command = $dir."sam/samtools index $taskName.sorted.bam"; + $command = $dir."sam/samtools index $sampleName.sorted.bam"; print "$command\n"; $status = system($command); if ($status != 0) { @@ -280,11 +294,16 @@ if ($genBamF) { print "\n"; } -&collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level -&collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level +&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level +&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level + +if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; } + +if ($mTime) { $time_start = time(); } if ($calcCI) { - $command = $dir."rsem-run-gibbs $refName $taskName $imdName $BURNIN $CHAINLEN $SAMPLEGAP"; + $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP"; +# $command .= " -p $nThreads"; if ($quiet) { $command .= " -q"; } print "$command\n"; $status = system($command); @@ -294,12 +313,12 @@ if ($calcCI) { } print "\n"; - system("mv $taskName.isoforms.results $imdName.isoforms.results.bak1"); - system("mv $taskName.genes.results $imdName.genes.results.bak1"); - &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); + system("mv $sampleName.genes.results $imdName.genes.results.bak1"); + &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level - $command = $dir."rsem-calculate-credibility-intervals $refName $taskName $imdName $CONFIDENCE $NSPC $NMB"; + $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB"; if ($quiet) { $command .= " -q"; } print "$command\n"; $status = system($command); @@ -309,20 +328,36 @@ if ($calcCI) { } print "\n"; - system("mv $taskName.isoforms.results $imdName.isoforms.results.bak2"); - system("mv $taskName.genes.results $imdName.genes.results.bak2"); - &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level - &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); + system("mv $sampleName.genes.results $imdName.genes.results.bak2"); + &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level } +if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } + +if ($mTime) { $time_start = time(); } + if (!$keep_intermediate_files) { - $status = system ("rm -rf $tmp_dir"); + $status = system("rm -rf $temp_dir"); if ($status != 0) { print "Fail to delete the temporary folder!\n"; exit(-1); } } +if ($mTime) { $time_end = time(); } + +if ($mTime) { + open(OUTPUT, ">$sampleName.time"); + print OUTPUT "Alignment: $time_alignment s.\n"; + print OUTPUT "RSEM: $time_rsem s.\n"; + print OUTPUT "CI: $time_ci s.\n"; + my $time_del = $time_end - $time_start; + print OUTPUT "Delete: $time_del s.\n"; + close(OUTPUT); +} + # inpF, outF sub collectResults { my $local_status; @@ -450,7 +485,7 @@ Calculate 95% credibility intervals and posterior mean estimates. (Default: off =item B<--seed-length> -Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. (Default: 25) +Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25) =item B<--tag> @@ -582,11 +617,6 @@ file. If no other attributes are given or no GTF file is provided in 'rsem-prepare-reference', there will be no tab after the tau_value field. -=item B and B - -Output files used by RSEM internally for tasks like simulation, -compute credibility intervals etc. - =item B Only generated when --out-bam is specified. @@ -595,7 +625,7 @@ Only generated when --out-bam is specified. genomic coordinates. Alignments of reads that have identical genomic coordinates (i.e., alignments to different isoforms that share the same genomic region) are collapsed into one alignment. The MAPQ field -of each alignment is set to max(100, floor(-10 * log10(1.0 - w) + +of each alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the posterior probability of that alignment being the true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, where value is a single precision floating number @@ -604,6 +634,9 @@ representing the posterior probability. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the sorted BAM file and indices generated by samtools (included in RSEM package). +=item B + +This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder. =back @@ -661,3 +694,4 @@ Assume the path to the bowtie executables is in the user's PATH environment vari mmliver_paired_end_quals =cut +