X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=rsem-calculate-expression;h=d724167a55e03b04952405ebf22db2e8751f629a;hb=66a1547e5d549a915cefb729222be915b87fb34d;hp=3f2817d083fac4605cbcb0ded7baaa6e6900db99;hpb=58d504aaf36ae486b1dba6d03e0e9f1c25855037;p=rsem.git

diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 3f2817d..d724167 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -61,6 +61,12 @@ my $keep_intermediate_files = 0;
 
 my $strand_specific = 0;
 
+my $bowtie2 = 0;
+my $bowtie2_path = "";
+my $bowtie2_mismatch_rate = 0.1;
+my $bowtie2_k = 200;
+my $bowtie2_sensitivity_level = "sensitive"; # must be one of "very_fast", "fast", "sensitive", "very_sensitive"
+
 my $version = 0;
 
 my $mTime = 0;
@@ -88,9 +94,14 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
 	   "bowtie-e=i" => \$E,
 	   "bowtie-m=i" => \$maxHits,
 	   "bowtie-chunkmbs=i" => \$chunkMbs,
-	   "bowtie-phred33-quals" => \$phred33,
-	   "bowtie-phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
-	   "bowtie-solexa-quals" => \$solexa,
+	   "phred33-quals" => \$phred33,
+	   "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
+	   "solexa-quals" => \$solexa,
+	   "bowtie2" => \$bowtie2,
+	   "bowtie2-path=s" => \$bowtie2_path,
+	   "bowtie2-mismatch-rate=f" => \$bowtie2_mismatch_rate,
+	   "bowtie2-k=i" => \$bowtie2_k,
+	   "bowtie2-sensitivity-level=s" => \$bowtie2_sensitivity_level,
 	   "forward-prob=f" => \$probF,
 	   "fragment-length-min=i" => \$minL,
 	   "fragment-length-max=i" => \$maxL,
@@ -120,12 +131,17 @@ pod2usage(-verbose => 2) if ($help == 1);
 if ($is_sam || $is_bam) {
     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
     pod2usage(-msg => "--sam and --bam cannot be active at the same time!", -exitval => 2, -verbose => 2) if ($is_sam == 1&& $is_bam == 1);
-    pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals or --solexa-quals cannot be set if input is SAM/BAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa);
+    pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k and --bowtie2-sensitivity-level cannot be set if input is SAM/BAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 200 || $bowtie2_sensitivity_level ne "sensitive");
 }
 else {
     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (!$paired_end && scalar(@ARGV) != 3 || $paired_end && scalar(@ARGV) != 4);    
-    pod2usage(-msg => "Only one of --phred33-quals --phred64-quals/--solexa1.3-quals --solexa-suqls can be active!", -exitval => 2, -verbose => 2) if ($phred33 + $phred64 + $solexa > 1);    
-    podwusage(-msg => "--sam , --bam or --sam-header-info cannot be set if use bowtie aligner to produce alignments!", -exitval => 2, -verbose => 2) if ($is_sam || $is_bam || $fn_list ne "");
+    pod2usage(-msg => "If --no-qualities is set, neither --phred33-quals, --phred64-quals or --solexa-quals can be active!", -exitval => 2, -verbose => 2) if ($no_qual && ($phred33 + $phred64 + $solexa > 0));
+    pod2usage(-msg => "Only one of --phred33-quals, --phred64-quals, and --solexa-quals can be active!", -exitval => 2, -verbose => 2) if ($phred33 + $phred64 + $solexa > 1);    
+    pod2usage(-msg => "--sam , --bam or --sam-header-info cannot be set if use bowtie/bowtie2 aligner to produce alignments!", -exitval => 2, -verbose => 2) if ($is_sam || $is_bam || $fn_list ne "");
+    pod2usage(-msg => "--bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k and --bowtie2-sensitivity-level cannot be set if bowtie aligner is used!", -exitval => 2, -verbose => 2) if (!$bowtie2 && ($bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 200 || $bowtie2_sensitivity_level ne "sensitive"));
+    pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m cannot be set if bowtie2 aligner is used!", -exitval => 2, -verbose => 2) if ($bowtie2 && ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200));
+    pod2usage(-msg => "Mismatch rate must be within [0, 1]!", -exitval => 2, -verbose => 2) if ($bowtie2 && ($bowtie2_mismatch_rate < 0.0 || $bowtie2_mismatch_rate > 1.0));
+    pod2usage(-msg => "Sensitivity level must be one of \"very_fast\", \"fast\", \"sensitive\", and \"very_sensitive\"!", -exitval => 2, -verbose => 2) if ($bowtie2 && (($bowtie2_sensitivity_level ne "very_fast") && ($bowtie2_sensitivity_level ne "fast") && ($bowtie2_sensitivity_level ne "sensitive") && ($bowtie2_sensitivity_level ne "very_sensitive"))); 
 }
 
 pod2usage(-msg => "Forward probability should be in [0, 1]!", -exitval => 2, -verbose => 2) if ($probF < 0 || $probF > 1);
@@ -189,39 +205,77 @@ if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) {
 my ($mate_minL, $mate_maxL) = (1, $maxL);
 
 if ($bowtie_path ne "") { $bowtie_path .= "/"; }
+if ($bowtie2_path ne "") { $bowtie2_path .= "/"; }
 
 my $command = "";
 
 if (!$is_sam && !$is_bam) {
-    $command = $bowtie_path."bowtie";
-    if ($no_qual) { $command .= " -f"; }
-    else { $command .= " -q"; }
-    
-    if ($phred33) { $command .= " --phred33-quals"; }
-    elsif ($phred64) { $command .= " --phred64-quals"; }
-    elsif ($solexa) { $command .= " --solexa-quals"; }
+    if (!$bowtie2) {
+	$command = $bowtie_path."bowtie";
+	if ($no_qual) { $command .= " -f"; }
+	else { $command .= " -q"; }
     
-    $command .= " -n $C -e $E -l $L";
-    if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
-    if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
+	if ($phred33) { $command .= " --phred33-quals"; }
+	elsif ($phred64) { $command .= " --phred64-quals"; }
+	elsif ($solexa) { $command .= " --solexa-quals"; }
     
-    if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
-    elsif ($probF == 0.0) { $command .= " --nofw"; }
-
-    $command .= " -p $nThreads -a -m $maxHits -S";
-    if ($quiet) { $command .= " --quiet"; }    
-
-    $command .= " $refName";
-    if ($read_type == 0 || $read_type == 1) {
-	$command .= " $mate1_list"; 
+	$command .= " -n $C -e $E -l $L";
+	if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
+	if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
+	
+	if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
+	elsif ($probF == 0.0) { $command .= " --nofw"; }
+	
+	$command .= " -p $nThreads -a -m $maxHits -S";
+	if ($quiet) { $command .= " --quiet"; }    
+
+	$command .= " $refName";
+	if ($read_type == 0 || $read_type == 1) {
+	    $command .= " $mate1_list"; 
+	}
+	else {
+	    $command .= " -1 $mate1_list -2 $mate2_list";
+	}
+
+	# pipe to samtools to generate a BAM file
+	$command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
     }
     else {
-	$command .= " -1 $mate1_list -2 $mate2_list";
+	$command = $bowtie2_path."bowtie2";
+	if ($no_qual) { $command .= " -f"; }
+	else { $command .= " -q"; }
+
+	if ($phred33) { $command .= " --phred33"; }
+	elsif ($phred64) { $command .= " --phred64"; }
+	elsif ($solexa) { $command .= " --solexa-quals"; }
+
+	if ($bowtie2_sensitivity_level eq "very_fast") { $command .= " --very-fast"; }
+	elsif ($bowtie2_sensitivity_level eq "fast") { $command .= " --fast"; }
+	elsif ($bowtie2_sensitivity_level eq "sensitive") { $command .= " --sensitive"; }
+	else { $command .= " --very-sensitive"; }
+
+	$command .= " --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score-min L,0,-$bowtie2_mismatch_rate";  
+
+	if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL --no-mixed --no-discordant"; }
+
+	if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
+	elsif ($probF == 0.0) { $command .= " --nofw"; }
+	
+	$command .= " -p $nThreads -k $bowtie2_k";
+	if ($quiet) { $command .= " --quiet"; }    
+
+	$command .= " -x $refName";
+	if ($read_type == 0 || $read_type == 1) {
+	    $command .= " -U $mate1_list"; 
+	}
+	else {
+	    $command .= " -1 $mate1_list -2 $mate2_list";
+	}
+
+	# pipe to samtools to generate a BAM file
+	$command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
     }
 
-    # pipe to samtools to generate a BAM file
-    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
-
     if ($mTime) { $time_start = time(); }
 
     &runCommand($command);
@@ -398,7 +452,7 @@ Input reads do not contain quality scores. (Default: off)
 
 =item B<--strand-specific>
 
-The RNA-Seq protocol used to generate the reads is strand specific, i.e., all (upstream) reads are derived from the forward strand.  This option is equivalent to --forward-prob=1.0.  With this option set, if RSEM runs the Bowtie aligner, the '--norc' Bowtie option will be used, which disables alignment to the reverse strand of transcripts.  (Default: off)
+The RNA-Seq protocol used to generate the reads is strand specific, i.e., all (upstream) reads are derived from the forward strand.  This option is equivalent to --forward-prob=1.0.  With this option set, if RSEM runs the Bowtie/Bowtie 2 aligner, the '--norc' Bowtie/Bowtie 2 option will be used, which disables alignment to the reverse strand of transcripts.  (Default: off)
 
 =item B<--sam>
 
@@ -414,7 +468,7 @@ RSEM reads header information from input by default. If this option is on, heade
 
 =item B<-p/--num-threads> <int>
 
-Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
+Number of threads to use. Both Bowtie/Bowtie2 and expression estimation will use this many threads. (Default: 1)
 
 =item B<--no-bam-output>
 
@@ -442,7 +496,7 @@ The name of the optional field used in the SAM input for identifying a read with
 
 =item B<--bowtie-path> <path>
 
-The path to the bowtie executables. (Default: the path to the bowtie executables is assumed to be in the user's PATH environment variable)
+The path to the Bowtie executables. (Default: the path to the Bowtie executables is assumed to be in the user's PATH environment variable)
 
 =item B<--bowtie-n> <int>
 
@@ -458,19 +512,39 @@ The path to the bowtie executables. (Default: the path to the bowtie executables
 
 =item B<--bowtie-chunkmbs> <int>
 
-(Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default)
+(Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use Bowtie's default)
+
+=item B<--phred33-quals>
+
+Input quality scores are encoded as Phred+33. (Default: on)
+
+=item B<--phred64-quals>
+
+Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
+
+=item B<--solexa-quals>
+
+Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
+
+=item B<--bowtie2>
+
+Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM does not handle indel, local and discordant alignments, the Bowtie2 parameters are set in a way to avoid those alignments. In particular, we use options '--very-sensitive --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score-min L,0,-rate'. "-rate", the last parameter of '--score-min' is the negative value of the mismatch rate provided by option '--bowtie2-mismatch-rate'. If reads are paired-end, we additionally use options '--no-mixed' and '--no-discordant'. (Default: off)
+
+=item B<--bowtie2-path> <path>
+
+(Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: the path to the Bowtie 2 executables is assumed to be in the user's PATH environment variable)
 
-=item B<--bowtie-phred33-quals>
+=item B<--bowtie2-mismatch-rate> <double>
 
-(Bowtie parameter) Input quality scores are encoded as Phred+33. (Default: on)
+(Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: 0.1)
 
-=item B<--bowtie-phred64-quals>
+=item B<--bowtie2-k> <int>
 
-(Bowtie parameter) Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
+(Bowtie 2 parameter) Find up to <int> alignments per read. (Default: 200)
 
-=item B<--bowtie-solexa-quals>
+=item B<--bowtie2-sensitivity-level> <string>
 
-(Bowtie parameter) Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
+(Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end mode. This option controls how hard Bowtie 2 tries to find alignments. <string> must be one of "very_fast", "fast", "sensitive" and "very_sensitive". The four candidates correspond to Bowtie 2's "--very-fast", "--fast", "--sensitive" and "--very-sensitive" options. (Default: "sensitive" - use Bowtie 2's default)
 
 =item B<--forward-prob> <double>
 
@@ -478,11 +552,11 @@ Probability of generating a read from the forward strand of a transcript. Set to
 
 =item B<--fragment-length-min> <int>
 
-Minimum read/insert length allowed. This is also the value for the bowtie -I option. (Default: 1)
+Minimum read/insert length allowed. This is also the value for the Bowtie/Bowtie2 -I option. (Default: 1)
 
 =item B<--fragment-length-max> <int>
 
-Maximum read/insert length allowed. This is also the value for the bowtie -X option. (Default: 1000)
+Maximum read/insert length allowed. This is also the value for the Bowtie/Bowtie 2 -X option. (Default: 1000)
 
 =item B<--fragment-length-mean> <double>
 
@@ -718,7 +792,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                            /ref/mouse_125 \
                            mmliver_single_without_quals
 
-4) Data are the same as 1). This time we assume the bowtie executables are under '/sw/bowtie'. We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
+4) Data are the same as 1). This time we assume the bowtie executables are under '/sw/bowtie'. We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation:
 
  rsem-calculate-expression --bowtie-path /sw/bowtie \
                            --phred64-quals \
@@ -732,7 +806,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                            /ref/mouse_125 \
                            mmliver_single_quals
 
-5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores.  We want to use 8 threads:
+5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores. We want to use 8 threads:
 
  rsem-calculate-expression --paired-end \
                            --bam \