rsem-calculate-expression

   1 #!/usr/bin/perl
   2
   3 use Getopt::Long;
   4 use Pod::Usage;
   5 use File::Basename;
   6 use strict;
   7
   8 #const
   9 my $BURNIN = 200;
  10 my $NCV = 1000;
  11 my $SAMPLEGAP = 1;
  12 my $CONFIDENCE = 0.95;
  13 my $NSPC = 50;
  14
  15 my $NMB = 1024; # default
  16
  17 my $status = 0;
  18
  19 my $read_type = 1; # default, single end with qual
  20
  21 my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "pme_expected_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
  22
  23 my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "pme_expected_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
  24
  25 my $bowtie_path = "";
  26 my $C = 2;
  27 my $E = 99999999;
  28 my $L = 25;
  29 my $maxHits = 200;
  30 my $chunkMbs = 0;       # 0 = use bowtie default
  31 my $phred33 = 0;
  32 my $phred64 = 0;
  33 my $solexa = 0;
  34
  35 my $is_sam = 0;
  36 my $is_bam = 0;
  37 my $fn_list = "";
  38 my $tagName = "XM";
  39
  40 my $probF = 0.5;
  41
  42 my $minL = 1;
  43 my $maxL = 1000;
  44 my $mean = -1;
  45 my $sd = 0;
  46
  47 my $estRSPD = 0;
  48 my $B = 20;
  49
  50 my $nThreads = 1;
  51 my $genBamF = 1;  # default is generating transcript bam file
  52 my $genGenomeBamF = 0;
  53 my $sampling = 0;
  54 my $calcCI = 0;
  55 my $var_opt = 0; # temporarily, only for internal use
  56 my $quiet = 0;
  57 my $help = 0;
  58
  59 my $paired_end = 0;
  60 my $no_qual = 0;
  61 my $keep_intermediate_files = 0;
  62
  63 my $strand_specific = 0;
  64
  65 my $version = 0;
  66
  67 my $mTime = 0;
  68 my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  69
  70 my $mate1_list = "";
  71 my $mate2_list = "";
  72 my $inpF = "";
  73
  74 my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
  75 my $gap = 32;
  76
  77 GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
  78            "temporary-folder=s" => \$temp_dir,
  79            "no-qualities" => \$no_qual,
  80            "paired-end" => \$paired_end,
  81            "strand-specific" => \$strand_specific,
  82            "sam" => \$is_sam,
  83            "bam" => \$is_bam,
  84            "sam-header-info=s" => \$fn_list,
  85            "tag=s" => \$tagName,
  86            "seed-length=i" => \$L,
  87            "bowtie-path=s" => \$bowtie_path,
  88            "bowtie-n=i" => \$C,
  89            "bowtie-e=i" => \$E,
  90            "bowtie-m=i" => \$maxHits,
  91            "bowtie-chunkmbs=i" => \$chunkMbs,
  92            "phred33-quals" => \$phred33,
  93            "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
  94            "solexa-quals" => \$solexa,
  95            "forward-prob=f" => \$probF,
  96            "fragment-length-min=i" => \$minL,
  97            "fragment-length-max=i" => \$maxL,
  98            "fragment-length-mean=f" => \$mean,
  99            "fragment-length-sd=f" => \$sd,
 100            "estimate-rspd" => \$estRSPD,
 101            "num-rspd-bins=i" => \$B,
 102            "p|num-threads=i" => \$nThreads,
 103            "no-bam-output" => sub { $genBamF = 0; },
 104            "output-genome-bam" => \$genGenomeBamF,
 105            "sampling-for-bam" => \$sampling,
 106            "var" => \$var_opt,
 107            "calc-ci" => \$calcCI,
 108            "ci-memory=i" => \$NMB,
 109            "time" => \$mTime,
 110            "version" => \$version,
 111            "q|quiet" => \$quiet,
 112            "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
 113
 114 my ($fn, $dir, $suf) = fileparse($0);
 115
 116 pod2usage(-verbose => 2) if ($help == 1);
 117 &showVersionInfo($dir) if ($version == 1);
 118
 119 #check parameters and options
 120
 121 if ($is_sam || $is_bam) {
 122     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
 123     pod2usage(-msg => "--sam and --bam cannot be active at the same time!", -exitval => 2, -verbose => 2) if ($is_sam == 1&& $is_bam == 1);
 124     pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals or --solexa-quals cannot be set if input is SAM/BAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa);
 125 }
 126 else {
 127     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (!$paired_end && scalar(@ARGV) != 3 || $paired_end && scalar(@ARGV) != 4);
 128     pod2usage(-msg => "Only one of --phred33-quals --phred64-quals/--solexa1.3-quals --solexa-suqls can be active!", -exitval => 2, -verbose => 2) if ($phred33 + $phred64 + $solexa > 1);
 129     podwusage(-msg => "--sam , --bam or --sam-header-info cannot be set if use bowtie aligner to produce alignments!", -exitval => 2, -verbose => 2) if ($is_sam || $is_bam || $fn_list ne "");
 130 }
 131
 132 pod2usage(-msg => "Forward probability should be in [0, 1]!", -exitval => 2, -verbose => 2) if ($probF < 0 || $probF > 1);
 133 pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -verbose => 2) if ($minL < 1);
 134 pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
 135 pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
 136 pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
 137 pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
 138 pod2usage(-msg => "--sampling-for-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
 139 pod2usage(-msg => "--output-genome-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($genGenomeBamF && !$genBamF);
 140
 141 if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; }
 142
 143 if ($strand_specific) { $probF = 1.0; }
 144
 145 if ($paired_end) {
 146     if ($no_qual) { $read_type = 2; }
 147     else { $read_type = 3; }
 148 }
 149 else {
 150     if ($no_qual) { $read_type = 0; }
 151     else { $read_type = 1; }
 152 }
 153
 154 if (scalar(@ARGV) == 3) {
 155     if ($is_sam || $is_bam) { $inpF = $ARGV[0]; }
 156     else {$mate1_list = $ARGV[0]; }
 157     $refName = $ARGV[1];
 158     $sampleName = $ARGV[2];
 159 }
 160 else {
 161     $mate1_list = $ARGV[0];
 162     $mate2_list = $ARGV[1];
 163     $refName = $ARGV[2];
 164     $sampleName = $ARGV[3];
 165 }
 166
 167 if ($genGenomeBamF) {
 168     open(INPUT, "$refName.ti");
 169     my $line = <INPUT>; chomp($line);
 170     close(INPUT);
 171     my ($M, $type) = split(/ /, $line);
 172     pod2usage(-msg => "No genome information provided, so genome bam file cannot be generated!\n", -exitval => 2, -verbose => 2) if ($type != 0);
 173 }
 174
 175 my $pos = rindex($sampleName, '/');
 176 if ($pos < 0) { $sampleToken = $sampleName; }
 177 else { $sampleToken = substr($sampleName, $pos + 1); }
 178
 179 if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
 180 $stat_dir = "$sampleName.stat";
 181
 182 if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
 183 if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
 184
 185 $imdName = "$temp_dir/$sampleToken";
 186 $statName = "$stat_dir/$sampleToken";
 187
 188 if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
 189
 190 my ($mate_minL, $mate_maxL) = (1, $maxL);
 191
 192 if ($bowtie_path ne "") { $bowtie_path .= "/"; }
 193
 194 my $command = "";
 195
 196 if (!$is_sam && !$is_bam) {
 197     $command = $bowtie_path."bowtie";
 198     if ($no_qual) { $command .= " -f"; }
 199     else { $command .= " -q"; }
 200
 201     if ($phred33) { $command .= " --phred33-quals"; }
 202     elsif ($phred64) { $command .= " --phred64-quals"; }
 203     elsif ($solexa) { $command .= " --solexa-quals"; }
 204
 205     $command .= " -n $C -e $E -l $L";
 206     if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
 207     if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
 208
 209     if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
 210     elsif ($probF == 0.0) { $command .= " --nofw"; }
 211
 212     $command .= " -p $nThreads -a -m $maxHits -S";
 213     if ($quiet) { $command .= " --quiet"; }
 214
 215     $command .= " $refName";
 216     if ($read_type == 0 || $read_type == 1) {
 217         $command .= " $mate1_list";
 218     }
 219     else {
 220         $command .= " -1 $mate1_list -2 $mate2_list";
 221     }
 222
 223     # pipe to samtools to generate a BAM file
 224     $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
 225
 226     if ($mTime) { $time_start = time(); }
 227
 228     &runCommand($command);
 229
 230     if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
 231
 232     $inpF = "$imdName.bam";
 233     $is_bam = 1; # alignments are outputed as a BAM file
 234 }
 235
 236 if ($mTime) { $time_start = time(); }
 237
 238 $command = $dir."rsem-parse-alignments $refName $imdName $statName";
 239
 240 my $samInpType;
 241 if ($is_sam) { $samInpType = "s"; }
 242 elsif ($is_bam) { $samInpType = "b"; }
 243
 244 $command .= " $samInpType $inpF -t $read_type";
 245 if ($fn_list ne "") { $command .= " -l $fn_list"; }
 246 if ($tagName ne "") { $command .= " -tag $tagName"; }
 247 if ($quiet) { $command .= " -q"; }
 248
 249 &runCommand($command);
 250
 251 $command = $dir."rsem-build-read-index $gap";
 252 if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; }
 253 elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; }
 254 elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
 255 elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
 256 else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); }
 257 &runCommand($command);
 258
 259 my $doesOpen = open(OUTPUT, ">$imdName.mparams");
 260 if ($doesOpen == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); }
 261 print OUTPUT "$minL $maxL\n";
 262 print OUTPUT "$probF\n";
 263 print OUTPUT "$estRSPD\n";
 264 print OUTPUT "$B\n";
 265 print OUTPUT "$mate_minL $mate_maxL\n";
 266 print OUTPUT "$mean $sd\n";
 267 print OUTPUT "$L\n";
 268 close(OUTPUT);
 269
 270 $command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
 271 if ($genBamF) {
 272     $command .= " -b $samInpType $inpF";
 273     if ($fn_list ne "") { $command .= " 1 $fn_list"; }
 274     else { $command .= " 0"; }
 275     if ($sampling) { $command .= " --sampling"; }
 276 }
 277 if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
 278 if ($quiet) { $command .= " -q"; }
 279
 280 &runCommand($command);
 281
 282 &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
 283 &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
 284
 285 if ($genBamF) {
 286     $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
 287     &runCommand($command);
 288     $command = $dir."sam/samtools index $sampleName.transcript.sorted.bam";
 289     &runCommand($command);
 290
 291     if ($genGenomeBamF) {
 292         $command = $dir."rsem-tbam2gbam $refName $sampleName.transcript.bam $sampleName.genome.bam";
 293         &runCommand($command);
 294         $command = $dir."sam/samtools sort $sampleName.genome.bam $sampleName.genome.sorted";
 295         &runCommand($command);
 296         $command = $dir."sam/samtools index $sampleName.genome.sorted.bam";
 297         &runCommand($command);
 298     }
 299 }
 300
 301 if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
 302
 303 if ($mTime) { $time_start = time(); }
 304
 305 if ($calcCI || $var_opt) {
 306     $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
 307     $command .= " -p $nThreads";
 308     if ($var_opt) { $command .= " --var"; }
 309     if ($quiet) { $command .= " -q"; }
 310     &runCommand($command);
 311 }
 312
 313 if ($calcCI) {
 314     system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
 315     system("mv $sampleName.genes.results $imdName.genes.results.bak1");
 316     &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
 317     &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
 318
 319     $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
 320     $command .= " -p $nThreads";
 321     if ($quiet) { $command .= " -q"; }
 322     &runCommand($command);
 323
 324     system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
 325     system("mv $sampleName.genes.results $imdName.genes.results.bak2");
 326     &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
 327     &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
 328 }
 329
 330 if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
 331
 332 if ($mTime) { $time_start = time(); }
 333
 334 if (!$keep_intermediate_files) {
 335     &runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!");
 336 }
 337
 338 if ($mTime) { $time_end = time(); }
 339
 340 if ($mTime) {
 341     open(OUTPUT, ">$sampleName.time");
 342     print OUTPUT "Aligning reads: $time_alignment s.\n";
 343     print OUTPUT "Estimating expression levels: $time_rsem s.\n";
 344     print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
 345     my $time_del = $time_end - $time_start;
 346 #    print OUTPUT "Delete: $time_del s.\n";
 347     close(OUTPUT);
 348 }
 349
 350 # command, {err_msg}
 351 sub runCommand {
 352     print $_[0]."\n";
 353     my $status = system($_[0]);
 354     if ($status != 0) {
 355         my $errmsg = "";
 356         if (scalar(@_) > 1) { $errmsg .= $_[1]."\n"; }
 357         $errmsg .= "\"$_[0]\" failed! Please check if you provide correct parameters/options for the pipeline!\n";
 358         print $errmsg;
 359         exit(-1);
 360     }
 361     print "\n";
 362 }
 363
 364 # inpF, outF
 365 sub collectResults {
 366     my $local_status;
 367     my ($inpF, $outF);
 368     my @results = ();
 369     my $line;
 370
 371     $inpF = $_[1];
 372     $outF = $_[2];
 373
 374     $local_status = open(INPUT, $inpF);
 375     if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); }
 376
 377     @results = ();
 378
 379     while ($line = <INPUT>) {
 380         chomp($line);
 381         my @local_arr = split(/\t/, $line);
 382         push(@results, \@local_arr);
 383     }
 384
 385     close(INPUT);
 386
 387     $local_status = open(OUTPUT, ">$outF");
 388     if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); }
 389
 390     my $n = scalar(@results);
 391     my $m = scalar(@{$results[0]});
 392
 393     $" = "\t";
 394
 395     my @out_arr = ();
 396     for (my $i = 0; $i < $n; $i++) {
 397         if ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); }
 398         elsif ($_[0] eq "gene") { push(@out_arr, $gene_title[$i]); }
 399         else { print "A bug on 'collectResults' is detected!\n"; exit(-1); }
 400     }
 401     print OUTPUT "@out_arr\n";
 402
 403     for (my $i = 0; $i < $m; $i++) {
 404         @out_arr = ();
 405         for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); }
 406         print OUTPUT "@out_arr\n";
 407     }
 408
 409     close(OUTPUT);
 410 }
 411
 412 # dir
 413 sub showVersionInfo {
 414     open(INPUT, "$_[0]\WHAT_IS_NEW");
 415     my $line = <INPUT>;
 416     chomp($line);
 417     close(INPUT);
 418     print "$line\n";
 419     exit(0);
 420 }
 421
 422 __END__
 423
 424 =head1 NAME
 425
 426 rsem-calculate-expression
 427
 428 =head1 SYNOPSIS
 429
 430 =over
 431
 432  rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
 433  rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
 434  rsem-calculate-expression [options] --sam/--bam [--paired-end] input reference_name sample_name
 435
 436 =back
 437
 438 =head1 ARGUMENTS
 439
 440 =over
 441
 442 =item B<upstream_read_files(s)>
 443
 444 Comma-separated list of files containing single-end reads or upstream reads for paired-end data.  By default, these files are assumed to be in FASTQ format.  If the --no-qualities option is specified, then FASTA format is expected.
 445
 446 =item B<downstream_read_file(s)>
 447
 448 Comma-separated list of files containing downstream reads which are paired with the upstream reads.  By default, these files are assumed to be in FASTQ format.  If the --no-qualities option is specified, then FASTA format is expected.
 449
 450 =item B<input>
 451
 452 SAM/BAM formatted input file.  If "-" is specified for the filename, SAM/BAM input is instead assumed to come from standard input. RSEM requires all alignments of the same read group together. For paired-end reads, RSEM also requires the two mates of any alignment be adjacent. See Description section for how to make input file obey RSEM's requirements.
 453
 454 =item B<reference_name>
 455
 456 The name of the reference used.  The user must have run 'rsem-prepare-reference' with this reference_name before running this program.
 457
 458 =item B<sample_name>
 459
 460 The name of the sample analyzed. All output files are prefixed by this name (e.g., sample_name.genes.results)
 461
 462 =back
 463
 464 =head1 OPTIONS
 465
 466 =over
 467
 468 =item B<--paired-end>
 469
 470 Input reads are paired-end reads. (Default: off)
 471
 472 =item B<--no-qualities>
 473
 474 Input reads do not contain quality scores. (Default: off)
 475
 476 =item B<--strand-specific>
 477
 478 The RNA-Seq protocol used to generate the reads is strand specific, i.e., all (upstream) reads are derived from the forward strand.  This option is equivalent to --forward-prob=1.0.  With this option set, if RSEM runs the Bowtie aligner, the '--norc' Bowtie option will be used, which disables alignment to the reverse strand of transcripts.  (Default: off)
 479
 480 =item B<--sam>
 481
 482 Input file is in SAM format. (Default: off)
 483
 484 =item B<--bam>
 485
 486 Input file is in BAM format. (Default: off)
 487
 488 =item B<--sam-header-info> <file>
 489
 490 RSEM reads header information from input by default. If this option is on, header information is read from the specified file. For the format of the file, please see SAM official website. (Default: "")
 491
 492 =item B<-p/--num-threads> <int>
 493
 494 Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
 495
 496 =item B<--no-bam-output>
 497
 498 Do not output any BAM file. (Default: off)
 499
 500 =item B<--output-genome-bam>
 501
 502 Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
 503
 504 =item B<--sampling-for-bam>
 505
 506 When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
 507
 508 =item B<--calc-ci>
 509
 510 Calculate 95% credibility intervals and posterior mean estimates.  (Default: off)
 511
 512 =item B<--seed-length> <int>
 513
 514 Seed length used by the read aligner.  Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least one of its mates' (for paired-end reads) length less than this value will be ignored. If the references are not added poly(A) tails, the minimum allowed value is 5, otherwise, the minimum allowed value is 25. Note that this script will only check if the value >= 5 and give a warning message if the value < 25 but >= 5. (Default: 25)
 515
 516 =item B<--tag> <string>
 517
 518 The name of the optional field used in the SAM input for identifying a read with too many valid alignments. The field should have the format <tagName>:i:<value>, where a <value> bigger than 0 indicates a read with too many alignments. (Default: "")
 519
 520 =item B<--bowtie-path> <path>
 521
 522 The path to the bowtie executables. (Default: the path to the bowtie executables is assumed to be in the user's PATH environment variable)
 523
 524 =item B<--bowtie-n> <int>
 525
 526 (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, Default: 2)
 527
 528 =item B<--bowtie-e> <int>
 529
 530 (Bowtie parameter) max sum of mismatch quality scores across the alignment. (Default: 99999999)
 531
 532 =item B<--bowtie-m> <int>
 533
 534 (Bowtie parameter) suppress all alignments for a read if > <int> valid alignments exist. (Default: 200)
 535
 536 =item B<--bowtie-chunkmbs> <int>
 537
 538 (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default)
 539
 540 =item B<--phred33-quals>
 541
 542 Input quality scores are encoded as Phred+33. (Default: on)
 543
 544 =item B<--phred64-quals>
 545
 546 Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
 547
 548 =item B<--solexa-quals>
 549
 550 Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
 551
 552 =item B<--forward-prob> <double>
 553
 554 Probability of generating a read from the forward strand of a transcript. Set to 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand, 0 for a strand-specific protocol where all (upstream) read are derived from the reverse strand, or 0.5 for a non-strand-specific protocol. (Default: 0.5)
 555
 556 =item B<--fragment-length-min> <int>
 557
 558 Minimum read/insert length allowed. This is also the value for the bowtie -I option. (Default: 1)
 559
 560 =item B<--fragment-length-max> <int>
 561
 562 Maximum read/insert length allowed. This is also the value for the bowtie -X option. (Default: 1000)
 563
 564 =item B<--fragment-length-mean> <double>
 565
 566 (single-end data only) The mean of the fragment length distribution, which is assumed to be a Gaussian. (Default: -1, which disables use of the fragment length distribution)
 567
 568 =item B<--fragment-length-sd> <double>
 569
 570 (single-end data only) The standard deviation of the fragment length distribution, which is assumed to be a Gaussian.  (Default: 0, which assumes that all fragments are of the same length, given by the rounded value of B<--fragment-length-mean>)
 571
 572 =item B<--estimate-rspd>
 573
 574 Set this option if you want to estimate the read start position distribution (RSPD) from data. Otherwise, RSEM will use a uniform RSPD. (Default: off)
 575
 576 =item B<--num-rspd-bins> <int>
 577
 578 Number of bins in the RSPD. Only relevant when '--estimate-rspd' is specified.  Use of the default setting is recommended. (Default: 20)
 579
 580 =item B<--ci-memory> <int>
 581
 582 Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). Set it larger for a faster CI calculation. However, leaving 2 GB memory free for other usage is recommended. (Default: 1024)
 583
 584 =item B<--keep-intermediate-files>
 585
 586 Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
 587
 588 =item B<--temporary-folder> <string>
 589
 590 Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
 591
 592 =item B<--time>
 593
 594 Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
 595
 596 =item B<-q/--quiet>
 597
 598 Suppress the output of logging information. (Default: off)
 599
 600 =item B<-h/--help>
 601
 602 Show help information.
 603
 604 =back
 605
 606 =head1 DESCRIPTION
 607
 608 In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments.  RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified.  Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section.
 609
 610 One simple way to make the alignment file satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use 'convert-sam-for-rsem' script. This script only accept SAM format files as input. If a BAM format file is obtained, please use samtools to convert it to a SAM file first. For example, if '/ref/mouse_125' is the 'reference_name' and the SAM file is named 'input.sam', you can run the following command:
 611
 612   convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
 613
 614 For details, please refer to 'convert-sam-for-rsem's documentation page.
 615
 616 The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires SEQ and QUAL are not '*'.
 617
 618 The user must run 'rsem-prepare-reference' with the appropriate reference before using this program.
 619
 620 For single-end data, it is strongly recommended that the user provide the fragment length distribution parameters (--fragment-length-mean and --fragment-length-sd).  For paired-end data, RSEM will automatically learn a fragment length distribution from the data.
 621
 622 Please note that some of the default values for the Bowtie parameters are not the same as those defined for Bowtie itself.
 623
 624 The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
 625
 626 With the '--calc-ci' option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
 627
 628 =head1 OUTPUT
 629
 630 =over
 631
 632 =item B<sample_name.isoforms.results>
 633
 634 File containing isoform level expression estimates. The first line
 635 contains column names separated by the tab character. The format of
 636 each line in the rest of this file is:
 637
 638 transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
 639
 640 Fields are separated by the tab character. Fields within "[]" are only
 641 presented if '--calc-ci' is set.
 642
 643 'transcript_id' is the transcript name of this transcript. 'gene_id'
 644 is the gene name of the gene which this transcript belongs to (denote
 645 this gene as its parent gene). If no gene information is provided,
 646 'gene_id' and 'transcript_id' are the same.
 647
 648 'length' is this transcript's sequence length (poly(A) tail is not
 649 counted). 'effective_length' counts only the positions that can
 650 generate a valid fragment. If no poly(A) tail is added,
 651 'effective_length' is equal to transcript length - mean fragment
 652 length + 1. If one transcript's effective length is less than 1, this
 653 transcript's both effective length and abundance estimates are set to
 654 0.
 655
 656 'expected_count' is the sum of the posterior probability of each read
 657 comes from this transcript over all reads. Because 1) each read
 658 aligning to this transcript has a probability of being generated from
 659 background noise; 2) RSEM may filter some alignable low quality reads,
 660 the sum of expected counts for all transcript are generally less than
 661 the total number of reads aligned.
 662
 663 'TPM' stands for Transcripts Per Million. It is a relative measure of
 664 transcript abundance. The sum of all transcripts' TPM is 1
 665 million. 'FPKM' stands for Fragments Per Kilobase of transcript per
 666 Million mapped reads. It is another relative measure of transcript
 667 abundance. If we define l_bar be the mean transcript length in a
 668 sample, which can be calculated as
 669
 670 l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript),
 671
 672 the following equation is hold:
 673
 674 FPKM_i = 10^3 / l_bar * TPM_i.
 675
 676 We can see that the sum of FPKM is not a constant across samples.
 677
 678 'IsoPct' stands for isoform percentage. It is the percentage of this
 679 transcript's abandunce over its parent gene's abandunce. If its parent
 680 gene has only one isoform or the gene information is not provided,
 681 this field will be set to 100.
 682
 683 'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean
 684 estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is
 685 the isoform percentage calculated from 'pme_TPM' values.
 686
 687 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
 688 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
 689 credibility intervals for TPM and FPKM values. The bounds are
 690 inclusive (i.e. [l, u]).
 691
 692 =item B<sample_name.genes.results>
 693
 694 File containing gene level expression estimates. The first line
 695 contains column names separated by the tab character. The format of
 696 each line in the rest of this file is:
 697
 698 gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
 699
 700 Fields are separated by the tab character. Fields within "[]" are only
 701 presented if '--calc-ci' is set.
 702
 703 'transcript_id(s)' is a comma-separated list of transcript_ids
 704 belonging to this gene. If no gene information is provided, 'gene_id'
 705 and 'transcript_id(s)' are identical (the 'transcript_id').
 706
 707 A gene's 'length' and 'effective_length' are
 708 defined as the weighted average of its transcripts' lengths and
 709 effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
 710 are just the sum of its transcripts' abundance estimates.
 711
 712 =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
 713
 714 Only generated when --no-bam-output is not specified.
 715
 716 'sample_name.transcript.bam' is a BAM-formatted file of read
 717 alignments in transcript coordinates. The MAPQ field of each alignment
 718 is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
 719 posterior probability of that alignment being the true mapping of a
 720 read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
 721 single precision floating number representing the posterior
 722 probability. Because this file contains all alignment lines produced
 723 by bowtie or user-specified aligners, it can also be used as a
 724 replacement of the aligner generated BAM/SAM file. For paired-end
 725 reads, if one mate has alignments but the other does not, this file
 726 marks the alignable mate as "unmappable" (flag bit 0x4) and appends an
 727 optional field "Z0:A:!".
 728
 729 'sample_name.transcript.sorted.bam' and
 730 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
 731 indices generated by samtools (included in RSEM package).
 732
 733 =item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
 734
 735 Only generated when --no-bam-output is not specified and --output-genome-bam is specified.
 736
 737 'sample_name.genome.bam' is a BAM-formatted file of read alignments in
 738 genomic coordinates. Alignments of reads that have identical genomic
 739 coordinates (i.e., alignments to different isoforms that share the
 740 same genomic region) are collapsed into one alignment.  The MAPQ field
 741 of each alignment is set to min(100, floor(-10 * log10(1.0 - w) +
 742 0.5)), where w is the posterior probability of that alignment being
 743 the true mapping of a read.  In addition, RSEM pads a new tag
 744 ZW:f:value, where value is a single precision floating number
 745 representing the posterior probability. If an alignment is spliced, a
 746 XS:A:value tag is also added, where value is either '+' or '-'
 747 indicating the strand of the transcript it aligns to.
 748
 749 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
 750 sorted BAM file and indices generated by samtools (included in RSEM package).
 751
 752 =item B<sample_name.time>
 753
 754 Only generated when --time is specified.
 755
 756 It contains time (in seconds) consumed by aligning reads, estimating expression levels and calculating credibility intervals.
 757
 758 =item B<sample_name.stat>
 759
 760 This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
 761
 762 =back
 763
 764 =head1 EXAMPLES
 765
 766 Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mouse_125'.
 767
 768 1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file:
 769
 770  rsem-calculate-expression --phred64-quals \
 771                            -p 8 \
 772                            --output-genome-bam \
 773                            /data/mmliver.fq \
 774                            /ref/mouse_125 \
 775                            mmliver_single_quals
 776
 777 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file:
 778
 779  rsem-calculate-expression -p 8 \
 780                            --paired-end \
 781                            /data/mmliver_1.fq \
 782                            /data/mmliver_2.fq \
 783                            /ref/mouse_125 \
 784                            mmliver_paired_end_quals
 785
 786 3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads:
 787
 788  rsem-calculate-expression -p 8 \
 789                            --no-qualities \
 790                            /data/mmliver.fa \
 791                            /ref/mouse_125 \
 792                            mmliver_single_without_quals
 793
 794 4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
 795
 796  rsem-calculate-expression --bowtie-path /sw/bowtie \
 797                            --phred64-quals \
 798                            --fragment-length-mean 150.0 \
 799                            --fragment-length-sd 35.0 \
 800                            -p 8 \
 801                            --output-genome-bam \
 802                            --calc-ci \
 803                            --ci-memory 1024 \
 804                            /data/mmliver.fq \
 805                            /ref/mouse_125 \
 806                            mmliver_single_quals
 807
 808 5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores.  We want to use 8 threads:
 809
 810  rsem-calculate-expression --paired-end \
 811                            --bam \
 812                            -p 8 \
 813                            /data/mmliver_paired_end_quals.bam \
 814                            /ref/mouse_125 \
 815                            mmliver_paired_end_quals
 816
 817 =cut