#!/usr/bin/perl use Getopt::Long; use Pod::Usage; use strict; my $standard_output = "&STDOUT"; my $out_file = $standard_output; my @tmp_dirs = (); my $help = 0; GetOptions("o=s" => \$out_file, "T|temporary-directory=s" => \@tmp_dirs, "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); my $command; my (@fields, @header) = (); my $M; # Load fields my @lines = (); my $type; open(INPUT, "$ARGV[0].ti"); @lines = ; chomp(@lines); close(INPUT); @fields = (); ($M, $type) = split(/ /, $lines[0]); for (my $i = 0; $i < $M; $i++) { push(@fields, "SN:$lines[$i * 6 + 1]"); } # Reorder header my $line; open(INPUT, $ARGV[1]); @header = (); while (($line = ) && substr($line, 0, 1) eq '@') { chomp($line); push(@header, $line); } close(INPUT); my $n = scalar(@header); if ($n > 0) { my %hash = (); my @ktable = (); my $tid = 0; for (my $i = 0; $i < $n; $i++) { my @arr = split(/\t/, $header[$i]); if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } my $hasSN = 0; foreach my $key (@arr) { if (substr($key, 0, 3) eq "SN:") { $hash{$key} = $i; $hasSN = 1; last; } } if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } push(@ktable, $fields[$tid++]); } if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } open(OUTPUT, ">$out_file"); for (my $i = 0; $i < $n; $i++) { if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } else { print OUTPUT $header[$hash{$ktable[$i]}]; } print OUTPUT "\n"; } close(OUTPUT); } # extract alignment section $command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; &runCommand($command); # sort and output the alignment section $command = "sort -k 1,1 -s"; if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } $command .= " $ARGV[1].__temp"; if ($out_file ne $standard_output) { $command .= " >> $out_file"; } &runCommand($command); # delete temporary files $command = "rm -f $ARGV[1].__temp"; &runCommand($command); # finish print STDERR "Conversion is completed successfully!\n"; # command, {err_msg} sub runCommand { print STDERR $_[0]."\n"; my $status = system($_[0]); if ($status != 0) { my $errmsg; if (scalar(@_) > 1) { $errmsg = $_[1]; } else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; } print STDERR $errmsg."\n"; exit(-1); } print STDERR "\n"; } __END__ =head1 NAME convert-sam-for-rsem =head1 SYNOPSIS =over convert-sam-for-rsem [options] reference_name input_sam =back =head1 ARGUMENTS =over =item B The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. =item B The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). =back =head1 OPTIONS =over =item B<-o> Output the converted SAM file into . (Default: STDOUT) =item B<-T/--temporary-directory> 'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". =item B<-h/--help> Show help information. =back =head1 DESCRIPTION This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. Note: This program can only recognize SAM files. See ARGUMENTS section. =head1 EXAMPLES Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. 1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz 2) Output to 'input_for_rsem.sam' directly: convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam =cut