#!/usr/bin/perl
use Getopt::Long;
use Pod::Usage;
use strict;
my $standard_output = "&STDOUT";
my $out_file = $standard_output;
my @tmp_dirs = ();
my $help = 0;
GetOptions("o=s" => \$out_file,
"T|temporary-directory=s" => \@tmp_dirs,
"h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2);
pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
my $command;
my (@fields, @header) = ();
my $M;
# Load fields
my @lines = ();
my $type;
open(INPUT, "$ARGV[0].ti");
@lines = ;
chomp(@lines);
close(INPUT);
@fields = ();
($M, $type) = split(/ /, $lines[0]);
for (my $i = 0; $i < $M; $i++) {
push(@fields, "SN:$lines[$i * 6 + 1]");
}
# Reorder header
my $line;
open(INPUT, $ARGV[1]);
@header = ();
while (($line = ) && substr($line, 0, 1) eq '@') {
chomp($line);
push(@header, $line);
}
close(INPUT);
my $n = scalar(@header);
if ($n > 0) {
my %hash = ();
my @ktable = ();
my $tid = 0;
for (my $i = 0; $i < $n; $i++) {
my @arr = split(/\t/, $header[$i]);
if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; }
my $hasSN = 0;
foreach my $key (@arr) {
if (substr($key, 0, 3) eq "SN:") {
$hash{$key} = $i;
$hasSN = 1;
last;
}
}
if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); }
push(@ktable, $fields[$tid++]);
}
if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); }
open(OUTPUT, ">$out_file");
for (my $i = 0; $i < $n; $i++) {
if ($ktable[$i] eq "") { print OUTPUT $header[$i]; }
else { print OUTPUT $header[$hash{$ktable[$i]}]; }
print OUTPUT "\n";
}
close(OUTPUT);
}
# extract alignment section
$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp";
&runCommand($command);
# sort and output the alignment section
$command = "sort -k 1,1 -s";
if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
$command .= " $ARGV[1].__temp";
if ($out_file ne $standard_output) { $command .= " >> $out_file"; }
&runCommand($command);
# delete temporary files
$command = "rm -f $ARGV[1].__temp";
&runCommand($command);
# finish
print STDERR "Conversion is completed successfully!\n";
# command, {err_msg}
sub runCommand {
print STDERR $_[0]."\n";
my $status = system($_[0]);
if ($status != 0) {
my $errmsg;
if (scalar(@_) > 1) { $errmsg = $_[1]; }
else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; }
print STDERR $errmsg."\n";
exit(-1);
}
print STDERR "\n";
}
__END__
=head1 NAME
convert_sam_for_rsem
=head1 SYNOPSIS
=over
convert_sam_for_rsem [options] reference_name input_sam
=back
=head1 ARGUMENTS
=over
=item B
The name of the reference used. This should be the same name used by 'rsem-prepare-reference'.
=item B
The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information).
=back
=head1 OPTIONS
=over
=item B<-o>
Output the converted SAM file into . (Default: STDOUT)
=item B<-T/--temporary-directory>
'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories".
=item B<-h/--help>
Show help information.
=back
=head1 DESCRIPTION
This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
Note: You do not need to run this script if Bowtie (not Bowtie 2) is used.
=head1 EXAMPLES
Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'.
1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz
2) Output to 'input_for_rsem.sam' directly:
convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
=cut