From 027875d072147a81c02a532d35b6d0450294a87a Mon Sep 17 00:00:00 2001 From: Bo Li Date: Fri, 9 Dec 2011 17:53:11 -0600 Subject: [PATCH] add a script to convert sam file for non-bowtie aligner --- convert_sam_for_rsem | 180 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100755 convert_sam_for_rsem diff --git a/convert_sam_for_rsem b/convert_sam_for_rsem new file mode 100755 index 0000000..9bdd033 --- /dev/null +++ b/convert_sam_for_rsem @@ -0,0 +1,180 @@ +#!/usr/bin/perl + +use Getopt::Long; +use Pod::Usage; +use strict; + +my $standard_output = "&STDOUT"; + +my $out_file = $standard_output; +my @tmp_dirs = (); +my $help = 0; + +GetOptions("o=s" => \$out_file, + "T|temporary-directory=s" => \@tmp_dirs, + "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); + + +pod2usage(-verbose => 2) if ($help == 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); + +my $command; +my (@fields, @header) = (); +my $M; + +# Load fields +my @lines = (); +my $type; + +open(INPUT, "$ARGV[0].ti"); +@lines = ; +chomp(@lines); +close(INPUT); + +@fields = (); +($M, $type) = split(/ /, $lines[0]); +for (my $i = 0; $i < $M; $i++) { + push(@fields, "SN:$lines[$i * 6 + 1]"); +} + + +# Reorder header +my $line; + +open(INPUT, $ARGV[1]); +@header = (); +while (($line = ) && substr($line, 0, 1) eq '@') { + chomp($line); + push(@header, $line); +} +close(INPUT); + +my $n = scalar(@header); +if ($n > 0) { + my %hash = (); + my @ktable = (); + + my $tid = 0; + + for (my $i = 0; $i < $n; $i++) { + my @arr = split(/\t/, $header[$i]); + if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } + my $hasSN = 0; + foreach my $key (@arr) { + if (substr($key, 0, 3) eq "SN:") { + $hash{$key} = $i; + $hasSN = 1; + last; + } + } + if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } + push(@ktable, $fields[$tid++]); + } + + if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } + + open(OUTPUT, ">$out_file"); + for (my $i = 0; $i < $n; $i++) { + if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } + else { print OUTPUT $header[$hash{$ktable[$i]}]; } + print OUTPUT "\n"; + } + close(OUTPUT); +} + + +# extract alignment section +$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; +&runCommand($command); + +# sort and output the alignment section +$command = "sort -k 1,1 -s"; +if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } +$command .= " $ARGV[1].__temp"; +if ($out_file ne $standard_output) { $command .= " >> $out_file"; } +&runCommand($command); + +# delete temporary files +$command = "rm -f $ARGV[1].__temp"; +&runCommand($command); + +# finish +print STDERR "Conversion is completed successfully!\n"; + +# command, {err_msg} +sub runCommand { + print STDERR $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; } + print STDERR $errmsg."\n"; + exit(-1); + } + print STDERR "\n"; +} + +__END__ + +=head1 NAME + +convert_sam_for_rsem + +=head1 SYNOPSIS + +=over + + convert_sam_for_rsem [options] reference_name input_sam + +=back + +=head1 ARGUMENTS + +=over + +=item B + +The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. + +=item B + +The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). + +=back + +=head1 OPTIONS + +=over + +=item B<-o> + +Output the converted SAM file into . (Default: STDOUT) + +=item B<-T/--temporary-directory> + +'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". + +=item B<-h/--help> + +Show help information. + +=back + +=head1 DESCRIPTION + +This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. + +=head1 EXAMPLES + +Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. + +1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': + + convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz + +2) Output to 'input_for_rsem.sam' directly: + + convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + +=cut -- 2.39.2