#!/usr/bin/env perl use Getopt::Long; use Pod::Usage; use File::Basename; use File::Path 'rmtree'; use FindBin; use lib $FindBin::RealBin; use rsem_perl_utils; use Env qw(@PATH); @PATH = ($FindBin::RealBin, "$FindBin::RealBin/sam", @PATH); use strict; my ($in_file, $out_file) = (); my @tmp_dirs = (); my $help = 0; GetOptions("T|temporary-directory=s" => \@tmp_dirs, "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); my $command; $in_file = $ARGV[0]; $out_file = "$ARGV[1].bam"; my ($fn, $dir, $suf) = fileparse($in_file, qr/\.[^.]*/); $suf = lc(substr($suf, 1)); pod2usage(-msg => "Input file's suffix is neither sam nor bam!", -exitval => 2, -verbose => 2) if (($suf ne "sam") && ($suf ne "bam")); my $isSam = ($suf eq "sam"); my $temp_dir = "$out_file.temp"; if (-d $temp_dir) { print "Warning: $temp_dir exists, convert-sam-for-rsem will write temporary files into this folder and delete it after it finishes!\n"; } else { if (!mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); } } # Convert to SAM format if input is a BAM file my $sam_file; if (!$isSam) { $sam_file = "$temp_dir/input.sam"; $command = "samtools view -h -o $sam_file $in_file"; &runCommand($command); } else { $sam_file = $in_file; } # Phase I, sort entries so that all entries of a same read groups together my $tmp_sam = "$temp_dir/tmp.sam"; # grep header section $command = "grep ^@ $sam_file > $tmp_sam"; &runCommand($command); # sort alignment section $command = "grep ^[^@] $sam_file | sort -k 1,1 -s"; if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } $command .= " >> $tmp_sam"; &runCommand($command); # Phase II, parse the temporary SAM file to make paired-end alignments' two mates adjacent to each other $command = "rsem-scan-for-paired-end-reads $tmp_sam $out_file"; &runCommand($command); # delete temporary directory rmtree($temp_dir); print STDERR "Conversion is completed. $out_file will be checked by 'rsem-sam-validator'.\n"; # Phase III, validate if the resulting bam file is correct $command = "rsem-sam-validator $out_file"; &runCommand($command); __END__ =head1 NAME convert-sam-for-rsem =head1 SYNOPSIS convert-sam-for-rsem [options] output_file_name =head1 ARGUMENTS =over =item B The SAM or BAM file generated by user's aligner. We require this file contains the header section. If input is a SAM file, it must end with suffix 'sam' (case insensitive). If input is a BAM file, it must end with suffix 'bam' (case insensitive). =item B The output name for the converted file. 'convert-sam-for-rsem' will output a BAM with the name 'output_file_name.bam'. =back =head1 OPTIONS =over =item B<-T/--temporary-directory> 'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". =item B<-h/--help> Show help information. =back =head1 DESCRIPTION This program converts the SAM/BAM file generated by user's aligner into a BAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references and output header sections. This program will create a temporary directory called 'output_file_name.bam.temp' to store the intermediate files. The directory will be deleted automatically after the conversion. After the conversion, this program will call 'rsem-sam-validator' to validate the resulting BAM file. Note: You do not need to run this script if `rsem-sam-validator' reports that your SAM/BAM file is valid. Note: This program does not check the correctness of input file. You should make sure the input is a valid SAM/BAM format file. =head1 EXAMPLES Suppose input is set to 'input.sam' and output file name is "output" convert-sam-for-rsem input.sam output We will get a file called 'output.bam' as output. =cut