X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=convert-sam-for-rsem;h=812c8b9ca6ce1e295d62752974588a419182b195;hp=68a42d7ea3cc468a9f96093da31c3d784e103695;hb=412c1a2821c5a4cbe2e68e4e9f4e2026a86d25f7;hpb=229ba9d68e0a801907631887640ab475d51c560c diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem index 68a42d7..812c8b9 100755 --- a/convert-sam-for-rsem +++ b/convert-sam-for-rsem @@ -1,17 +1,25 @@ -#!/usr/bin/perl +#!/usr/bin/env perl use Getopt::Long; use Pod::Usage; +use File::Basename; +use File::Path 'rmtree'; + +use FindBin; +use lib $FindBin::RealBin; +use rsem_perl_utils; + +use Env qw(@PATH); +@PATH = ($FindBin::RealBin, "$FindBin::RealBin/sam", @PATH); + use strict; -my $standard_output = "&STDOUT"; +my ($in_file, $out_file) = (); -my $out_file = $standard_output; my @tmp_dirs = (); my $help = 0; -GetOptions("o=s" => \$out_file, - "T|temporary-directory=s" => \@tmp_dirs, +GetOptions("T|temporary-directory=s" => \@tmp_dirs, "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); @@ -19,101 +27,63 @@ pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); my $command; -my (@fields, @header) = (); -my $M; - -# Load fields -my @lines = (); -my $type; - -open(INPUT, "$ARGV[0].ti"); -@lines = ; -chomp(@lines); -close(INPUT); - -@fields = (); -($M, $type) = split(/ /, $lines[0]); -for (my $i = 0; $i < $M; $i++) { - push(@fields, "SN:$lines[$i * 6 + 1]"); -} +$in_file = $ARGV[0]; +$out_file = "$ARGV[1].bam"; + +my ($fn, $dir, $suf) = fileparse($in_file, qr/\.[^.]*/); + +$suf = lc(substr($suf, 1)); +pod2usage(-msg => "Input file's suffix is neither sam nor bam!", -exitval => 2, -verbose => 2) if (($suf ne "sam") && ($suf ne "bam")); +my $isSam = ($suf eq "sam"); -# Reorder header -my $line; +my $temp_dir = "$out_file.temp"; +if (-d $temp_dir) { print "Warning: $temp_dir exists, convert-sam-for-rsem will write temporary files into this folder and delete it after it finishes!\n"; } +else { + if (!mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); } +} -open(INPUT, $ARGV[1]); -@header = (); -while (($line = ) && substr($line, 0, 1) eq '@') { - chomp($line); - push(@header, $line); +# Convert to SAM format if input is a BAM file + +my $sam_file; + +if (!$isSam) { + $sam_file = "$temp_dir/input.sam"; + $command = "samtools view -h -o $sam_file $in_file"; + &runCommand($command); } -close(INPUT); - -my $n = scalar(@header); -if ($n > 0) { - my %hash = (); - my @ktable = (); - - my $tid = 0; - - for (my $i = 0; $i < $n; $i++) { - my @arr = split(/\t/, $header[$i]); - if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } - my $hasSN = 0; - foreach my $key (@arr) { - if (substr($key, 0, 3) eq "SN:") { - $hash{$key} = $i; - $hasSN = 1; - last; - } - } - if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } - push(@ktable, $fields[$tid++]); - } - - if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } - - open(OUTPUT, ">$out_file"); - for (my $i = 0; $i < $n; $i++) { - if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } - else { print OUTPUT $header[$hash{$ktable[$i]}]; } - print OUTPUT "\n"; - } - close(OUTPUT); +else { + $sam_file = $in_file; } +# Phase I, sort entries so that all entries of a same read groups together + +my $tmp_sam = "$temp_dir/tmp.sam"; -# extract alignment section -$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; +# grep header section +$command = "grep ^@ $sam_file > $tmp_sam"; &runCommand($command); -# sort and output the alignment section -$command = "sort -k 1,1 -s"; +# sort alignment section +$command = "grep ^[^@] $sam_file | sort -k 1,1 -s"; if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } -$command .= " $ARGV[1].__temp"; -if ($out_file ne $standard_output) { $command .= " >> $out_file"; } +$command .= " >> $tmp_sam"; &runCommand($command); -# delete temporary files -$command = "rm -f $ARGV[1].__temp"; +# Phase II, parse the temporary SAM file to make paired-end alignments' two mates adjacent to each other + +$command = "rsem-scan-for-paired-end-reads $tmp_sam $out_file"; &runCommand($command); -# finish -print STDERR "Conversion is completed successfully!\n"; - -# command, {err_msg} -sub runCommand { - print STDERR $_[0]."\n"; - my $status = system($_[0]); - if ($status != 0) { - my $errmsg; - if (scalar(@_) > 1) { $errmsg = $_[1]; } - else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the script!"; } - print STDERR $errmsg."\n"; - exit(-1); - } - print STDERR "\n"; -} +# delete temporary directory +rmtree($temp_dir); + +print STDERR "Conversion is completed. $out_file will be checked by 'rsem-sam-validator'.\n"; + +# Phase III, validate if the resulting bam file is correct + +$command = "rsem-sam-validator $out_file"; +&runCommand($command); __END__ @@ -123,23 +93,19 @@ convert-sam-for-rsem =head1 SYNOPSIS -=over - - convert-sam-for-rsem [options] reference_name input_sam - -=back +convert-sam-for-rsem [options] output_file_name =head1 ARGUMENTS =over -=item B +=item B -The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. +The SAM or BAM file generated by user's aligner. We require this file contains the header section. If input is a SAM file, it must end with suffix 'sam' (case insensitive). If input is a BAM file, it must end with suffix 'bam' (case insensitive). -=item B +=item B -The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). +The output name for the converted file. 'convert-sam-for-rsem' will output a BAM with the name 'output_file_name.bam'. =back @@ -147,10 +113,6 @@ The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM =over -=item B<-o> - -Output the converted SAM file into . (Default: STDOUT) - =item B<-T/--temporary-directory> 'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". @@ -163,22 +125,18 @@ Show help information. =head1 DESCRIPTION -This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. +This program converts the SAM/BAM file generated by user's aligner into a BAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references and output header sections. This program will create a temporary directory called 'output_file_name.bam.temp' to store the intermediate files. The directory will be deleted automatically after the conversion. After the conversion, this program will call 'rsem-sam-validator' to validate the resulting BAM file. -Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. +Note: You do not need to run this script if `rsem-sam-validator' reports that your SAM/BAM file is valid. -Note: This program can only recognize SAM files. See ARGUMENTS section. +Note: This program does not check the correctness of input file. You should make sure the input is a valid SAM/BAM format file. =head1 EXAMPLES -Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. - -1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': - - convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz +Suppose input is set to 'input.sam' and output file name is "output" -2) Output to 'input_for_rsem.sam' directly: + convert-sam-for-rsem input.sam output - convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam +We will get a file called 'output.bam' as output. =cut