Fixed a typo in rsem-calculate-expression's document; Added multi-thread and memory...

[rsem.git] / rsem-prepare-reference
diff --git a/rsem-prepare-reference b/rsem-prepare-reference

index 78743e946f2b2f1b6b02ecdb6cbeeaaebdf3c70e..81fd13ea74edab4b31dd147db9be502e62321167 100755 (executable)
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -2,9 +2,12 @@
  
  use Getopt::Long;
  use Pod::Usage;        
-use File::Basename;
+use FindBin;
+use lib $FindBin::Bin;
  use strict;
  
+use rsem_perl_utils;
+
  my $status;
  
  my $gtfF = "";
@@ -16,6 +19,8 @@ my $polyALen = 125;
  my $bowtie_path = "";
  my $no_bowtie = 0;
  my $no_ntog = 0; 
+my $bowtie2 = 0;
+my $bowtie2_path = "";
  my $quiet = 0;
  my $help = 0;
  
@@ -27,16 +32,22 @@ GetOptions("gtf=s" => \$gtfF,
            "bowtie-path=s" => \$bowtie_path,
            "no-bowtie" => \$no_bowtie,
            "no-ntog" => \$no_ntog,
+          "bowtie2" => \$bowtie2,
+          "bowtie2-path=s" => \$bowtie2_path,
            "q|quiet" => \$quiet,
            "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
  
  pod2usage(-verbose => 2) if ($help == 1);
  pod2usage(-msg => "Set --no-polyA & --no-polyA-subset at the same time!", -exitval => 2, -verbose => 2) if ($no_polyA == 1 && $subsetFile ne '');
  pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
-pod2usage(-msg => "If bowtie is used, --no-ntog cannot be set!", -exitval => 2, -verbose => 2) if (!$no_bowtie && $no_ntog);
  
-if ($no_bowtie && $bowtie_path ne "") { print "Warning: If bowtie is not used, no need to set --bowtie-path option!\n"; }
+if ($bowtie2) { $no_bowtie = 1; $no_ntog = 1; }
  
+pod2usage(-msg => "If bowtie is used, --no-ntog cannot be set!", -exitval => 2, -verbose => 2) if (!$no_bowtie && $no_ntog);
+
+if ($no_bowtie && ($bowtie_path ne "")) { print "Warning: If bowtie is not used, no need to set --bowtie-path option!\n"; }
+if (!$bowtie2 && ($bowtie2_path ne "")) { print "Warning: If bowtie2 is not used, no need to set --bowtie2-path option!\n"; }
+ 
  my $type;
  
  if ($gtfF ne "") { $type = 0; }
@@ -57,8 +68,9 @@ if ($no_polyA) { $polyAChoice = 1 }
  elsif ($subsetFile ne "") { $polyAChoice = 2; }
  
  if ($bowtie_path ne "") { $bowtie_path .= "/"; }
+if ($bowtie2_path ne "") { $bowtie2_path .= "/"; }
  
-my ($fn, $dir, $suf) = fileparse($0); 
+my $dir = "$FindBin::Bin/";
  my $command = "";
  
  if ($type == 0) {
@@ -93,18 +105,12 @@ if (!$no_bowtie) {
      &runCommand($command);
  }
  
-# command, {err_msg}
-sub runCommand {
-    print $_[0]."\n";
-    my $status = system($_[0]);
-    if ($status != 0) { 
-       my $errmsg;
-       if (scalar(@_) > 1) { $errmsg = $_[1]; }
-       else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
-       print $errmsg."\n";
-       exit(-1);
-    }
-    print "\n";
+if ($bowtie2) { 
+    $command = $bowtie2_path."bowtie2-build -f";
+    if ($quiet) { $command .= " -q"; }
+    $command .= " $ARGV[1].idx.fa $ARGV[1]";
+    
+    &runCommand($command);
  }
  
  __END__
@@ -115,11 +121,7 @@ rsem-prepare-reference
  
  =head1 SYNOPSIS
  
-=over
-
- rsem-prepare-reference [options] reference_fasta_file(s) reference_name
-
-=back
+rsem-prepare-reference [options] reference_fasta_file(s) reference_name
  
  =head1 ARGUMENTS
  
@@ -155,7 +157,7 @@ Each line of <file> should be of the form:
  gene_id transcript_id
  
  with the two fields separated by a tab character.
- 
+
  If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format.
  
  If this option is off, then the mapping of isoforms to genes depends on whether the --gtf option is specified.  If --gtf is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file.  Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
@@ -164,7 +166,7 @@ If this option is off, then the mapping of isoforms to genes depends on whether
  
  =item B<--no-polyA>
  
-Do not add poly(A) tails to the end of reference isoforms. (Default: add poly(A) tails to all transcripts)
+Do not add poly(A) tails to the end of reference isoforms. (Default: poly(A) tails are added to all transcripts)
  
  =item B<--no-polyA-subset> <file>
  
@@ -180,12 +182,19 @@ The path to the Bowtie executables. (Default: the path to Bowtie executables is
  
  =item B<--no-bowtie>
  
-Do not build Bowtie indices.  Specify this option if you wish to use an alternative aligner for mapping reads to transcripts.  You should align against the sequences generated in the output file 'reference_name.idx.fa'.  (Default: off)
+Do not build Bowtie indices.  Specify this option if you wish to use an alternative aligner for mapping reads to transcripts.  You should align against the sequences generated in the output file 'reference_name.idx.fa'. (Default: off)
  
  =item B<--no-ntog>
  
-Disable the conversion of 'N' characters to 'G' characters in the reference sequences.  This conversion is normally desired because it allows some aligners (e.g., Bowtie) to align against all positions in the reference.
-(Default: off)
+Disable the conversion of 'N' characters to 'G' characters in the reference sequences prepared for aligners.  This conversion is particularly desirable for the Bowtie aligner, because it allows Bowtie to align against all positions in the reference. (Default: off)
+
+=item B<--bowtie2>
+
+Build Bowtie 2 indices instead of Bowtie indices. Turning on this option will automatically turn on the '--no-bowtie' and '--no-ntog' options. (Default: off)
+
+=item B<--bowtie2-path>
+
+The path to the Bowtie 2 executables. (Default: the path to Bowtie 2 executables is assumed to be in the user's PATH environment variable)
  
  =item B<-q/--quiet>
  
@@ -205,10 +214,12 @@ This program extracts/preprocesses the reference sequences and builds Bowtie ind
  
  This program will generate 'reference_name.grp', 'reference_name.ti', 'reference_name.transcripts.fa', 'reference_name.seq', 'reference_name.chrlist' (if '--gtf' is on), 'reference_name.idx.fa', and corresponding Bowtie index files (unless '--no-bowtie' is specified).
  
-'reference_name.grp', 'reference_name.ti', 'reference_name.seq', 'reference_name.idx.fa', and 'reference_name.chrlist' are used by RSEM internally.
+'reference_name.grp', 'reference_name.ti', 'reference_name.seq', and 'reference_name.chrlist' are used by RSEM internally.
  
-B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are not added.
+B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are added unless '--no-polyA' is set.
  
+B<'reference_name.idx.fa'> is used by aligners to build their own indices. If '--no-ntog' is set, this file should be identical to 'reference_name.transcripts.fa'.
+ 
  =head1 EXAMPLES
  
  1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'.  We want to put the generated reference files under '/ref' with name 'mouse_125'. We'll add poly(A) tails with length 125. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information.  For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file.  Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'.