X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=convert-sam-for-rsem;h=348354b29a2f91e707575b33b374d2752b7c563a;hb=97554bbac838f2ed578d81f98e421dac0669e74e;hp=68a42d7ea3cc468a9f96093da31c3d784e103695;hpb=229ba9d68e0a801907631887640ab475d51c560c;p=rsem.git diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem index 68a42d7..348354b 100755 --- a/convert-sam-for-rsem +++ b/convert-sam-for-rsem @@ -4,9 +4,7 @@ use Getopt::Long; use Pod::Usage; use strict; -my $standard_output = "&STDOUT"; - -my $out_file = $standard_output; +my $out_file = ""; my @tmp_dirs = (); my $help = 0; @@ -16,86 +14,19 @@ GetOptions("o=s" => \$out_file, pod2usage(-verbose => 2) if ($help == 1); -pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 1); my $command; -my (@fields, @header) = (); -my $M; - -# Load fields -my @lines = (); -my $type; - -open(INPUT, "$ARGV[0].ti"); -@lines = ; -chomp(@lines); -close(INPUT); - -@fields = (); -($M, $type) = split(/ /, $lines[0]); -for (my $i = 0; $i < $M; $i++) { - push(@fields, "SN:$lines[$i * 6 + 1]"); -} - - -# Reorder header -my $line; - -open(INPUT, $ARGV[1]); -@header = (); -while (($line = ) && substr($line, 0, 1) eq '@') { - chomp($line); - push(@header, $line); -} -close(INPUT); - -my $n = scalar(@header); -if ($n > 0) { - my %hash = (); - my @ktable = (); - - my $tid = 0; - - for (my $i = 0; $i < $n; $i++) { - my @arr = split(/\t/, $header[$i]); - if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } - my $hasSN = 0; - foreach my $key (@arr) { - if (substr($key, 0, 3) eq "SN:") { - $hash{$key} = $i; - $hasSN = 1; - last; - } - } - if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } - push(@ktable, $fields[$tid++]); - } - - if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } - open(OUTPUT, ">$out_file"); - for (my $i = 0; $i < $n; $i++) { - if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } - else { print OUTPUT $header[$hash{$ktable[$i]}]; } - print OUTPUT "\n"; - } - close(OUTPUT); -} - - -# extract alignment section -$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; +# grep header section +$command = "grep ^@ $ARGV[0]"; +if ($out_file ne "") { $command .= " > $out_file"; } &runCommand($command); -# sort and output the alignment section -$command = "sort -k 1,1 -s"; +# sort alignment section +$command = "grep ^[^@] $ARGV[0] | sort -k 1,1 -s"; if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } -$command .= " $ARGV[1].__temp"; -if ($out_file ne $standard_output) { $command .= " >> $out_file"; } -&runCommand($command); - -# delete temporary files -$command = "rm -f $ARGV[1].__temp"; +if ($out_file ne "") { $command .= " >> $out_file"; } &runCommand($command); # finish @@ -125,7 +56,7 @@ convert-sam-for-rsem =over - convert-sam-for-rsem [options] reference_name input_sam + convert-sam-for-rsem [options] input_sam =back @@ -133,10 +64,6 @@ convert-sam-for-rsem =over -=item B - -The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. - =item B The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). @@ -165,20 +92,20 @@ Show help information. This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. -Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. +Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. Note: This program can only recognize SAM files. See ARGUMENTS section. =head1 EXAMPLES -Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. +Suppose input_sam is set to 'input.sam'. 1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': - convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz + convert-sam-for-rsem input.sam | gzip > input_for_rsem.sam.gz 2) Output to 'input_for_rsem.sam' directly: - convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + convert-sam-for-rsem input.sam -o input_for_rsem.sam =cut