-#!/usr/bin/env perl
-#
-#
-# Script to convert GERALD export files to SAM format.
-#
-#
-#
-########## License:
-#
-# The MIT License
-#
-# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd.
-# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-#
-#
-########## ChangeLog:
-#
-# Version: 2.0.0 (15FEB2010)
-# Script updated by Illumina in conjunction with CASAVA 1.7.0 release.
-# Major changes are as follows:
-# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments.
-# - The ELAND single read alignment score is always stored in the optional "SM" field
-# and the ELAND paired read alignment score is stored in the optional "AS" field
-# when it exists.
-# - The MAPQ value is set to the higher of the two alignment scores, but no greater
-# than 254, i.e. min(254,max(SM,AS))
-# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's
-# expected orientation and insert size criteria.
-# - The default quality score translation is set for export files which contain
-# Phread+64 quality values. An option, "--qlogodds", has been added to
-# translate quality values from the Solexa+64 format used in export files prior
-# to Pipeline 1.3
-# - The export match descriptor is now reverse-complemented when necessary such that
-# it always corresponds to the forward strand of the reference, to be consistent
-# with other information in the SAM record. It is now written to the optional
-# 'XD' field (rather than 'MD') to acknowledge its minor differences from the
-# samtools match descriptor (see additional detail below).
-# - An option, "--nofilter", has been added to include reads which have failed
-# primary analysis quality filtration. Such reads will have the corresponding
-# SAM flag bit (0x0200) set.
-# - Labels in the export 'contig' field are preserved by setting RNAME to
-# "$export_chromosome/$export_contig" when then contig label exists.
-#
-#
-# Contact: lh3
-# Version: 0.1.2 (03JAN2009)
-#
-#
-#
-########## Known Conversion Limitations:
-#
-# - Export records for reads that map to a position < 1 (allowed in export format), are converted
-# to unmapped reads in the SAM record.
-# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the
-# aligner could not map the read to the reference sequence set, and "QC" means that the
-# aligner did not attempt to map the read due to some technical limitation. Both of these
-# alignment types are collapsed to the single unmapped alignment state in the SAM record.
-# - The export match descriptor is slightly different than the samtools match descriptor. For
-# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the
-# export match descriptor differs from the samtools version in two respects: (1) indels
-# are explicitly closed with the '$' character and (2) insertions must be enumerated in
-# the match descriptor. For example a 35-base read with a two-base insertion is described
-# as: 20^2$14
-#
-#
-#
-
-my $version = "2.0.0";
-
-use strict;
-use warnings;
-
-use File::Spec qw(splitpath);
-use Getopt::Long;
-use List::Util qw(min max);
-
-
-use constant {
- EXPORT_INDEX => 6,
- EXPORT_READNO => 7,
- EXPORT_READ => 8,
- EXPORT_QUAL => 9,
- EXPORT_CHROM => 10,
- EXPORT_CONTIG => 11,
- EXPORT_POS => 12,
- EXPORT_STRAND => 13,
- EXPORT_MD => 14,
- EXPORT_SEMAP => 15,
- EXPORT_PEMAP => 16,
- EXPORT_PASSFILT => 21,
-};
-
-
-use constant {
- SAM_QNAME => 0,
- SAM_FLAG => 1,
- SAM_RNAME => 2,
- SAM_POS => 3,
- SAM_MAPQ => 4,
- SAM_CIGAR => 5,
- SAM_MRNM => 6,
- SAM_MPOS => 7,
- SAM_ISIZE => 8,
- SAM_SEQ => 9,
- SAM_QUAL => 10,
-};
-
-
-# function prototypes for Richard's code
-sub match_desc_to_cigar($);
-sub match_desc_frag_length($);
-sub reverse_compl_match_descriptor($);
-sub write_header($;$;$);
-
-
-&export2sam;
-exit;
-
-
-
-
-sub export2sam {
-
- my $cmdline = $0 . " " . join(" ",@ARGV);
- my $arg_count = scalar @ARGV;
- my @spval = File::Spec->splitpath($0);
- my $progname = $spval[2];
-
- my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
- my $is_nofilter = 0;
- my $read1file;
- my $read2file;
- my $print_version = 0;
- my $help = 0;
-
- my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
- "nofilter" => \$is_nofilter,
- "read1=s" => \$read1file,
- "read2=s" => \$read2file,
- "version" => \$print_version,
- "help" => \$help );
-
- my $usage = <<END;
-
-$progname converts GERALD export files to SAM format.
-
-Usage: $progname --read1=FILENAME [ options ] | --version | --help
-
- --read1=FILENAME read1 export file (mandatory)
- --read2=FILENAME read2 export file
- --nofilter include reads that failed the pipeline/RTA purity filter
- --qlogodds assume export file(s) use logodds quality values as reported
- by pipeline prior to v1.3 (default: phred quality values)
-
-END
-
- my $version_msg = <<END;
-
-$progname version: $version
-
-END
-
- if((not $result) or $help or ($arg_count==0)) {
- die($usage);
- }
-
- if(@ARGV) {
- print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
- die($usage);
- }
-
- if($print_version) {
- die($version_msg);
- }
-
- if(not defined($read1file)) {
- print STDERR "\nERROR: read1 export file must be specified\n\n";
- die($usage);
- }
-
- my ($fh1, $fh2, $is_paired);
-
- open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n");
- $is_paired = defined $read2file;
- if ($is_paired) {
- open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n");
- }
- # quality value conversion table
- my @conv_table;
- if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
- for (-64..64) {
- $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
- }
- } else { # convert from phred+64 quality values (pipeline v1.3+):
- for (-64..-1) {
- $conv_table[$_+64] = undef;
- }
- for (0..64) {
- $conv_table[$_+64] = int(33 + $_);
- }
- }
- # write the header
- print write_header( $progname, $version, $cmdline );
- # core loop
- my $export_line_count = 0;
- while (<$fh1>) {
- $export_line_count++;
- my (@s1, @s2);
- &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
- if ($is_paired) {
- my $read2line = <$fh2>;
- if(not $read2line){
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
- }
- &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
-
- if (@s1 && @s2) { # then set mate coordinate
- if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
- die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
- }
-
- my $isize = 0;
- if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
- my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
- my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
- $isize = $x2 - $x1;
- }
-
- foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
- my ($sa,$sb,$is) = @{$_};
- if ($sb->[SAM_RNAME] ne '*') {
- $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
- $sa->[SAM_MPOS] = $sb->[SAM_POS];
- $sa->[SAM_ISIZE] = $is;
- $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
- } else {
- $sa->[SAM_FLAG] |= 0x8;
- }
- }
- }
- }
- print join("\t", @s1), "\n" if (@s1);
- print join("\t", @s2), "\n" if (@s2 && $is_paired);
- }
- close($fh1);
- if($is_paired) {
- while(my $read2line = <$fh2>){
- $export_line_count++;
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
- }
- close($fh2);
- }
-}
-
-sub export2sam_aux {
- my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
- chomp($line);
- my @t = split("\t", $line);
- @$s = ();
- my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
- return if(not ($isPassFilt or $is_nofilter));
- # read name
- $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
- # initial flag (will be updated later)
- $s->[SAM_FLAG] = 0;
- if($is_paired) {
- if($t[EXPORT_READNO] != $read_no){
- die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
- }
- $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
- }
- $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
-
- # read & quality
- my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
- if ($is_export_rev) { # then reverse the sequence and quality
- $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
- $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
- $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
- } else {
- $s->[SAM_SEQ] = $t[EXPORT_READ];
- $s->[SAM_QUAL] = $t[EXPORT_QUAL];
- }
- my @convqual = ();
- foreach (unpack('C*', $s->[SAM_QUAL])){
- my $val=$ct->[$_];
- if(not defined $val){
- my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
- if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
- die($msg . "\n");
- }
- push @convqual,$val;
- }
-
- $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
-
-
- # coor
- my $has_coor = 0;
- $s->[SAM_RNAME] = "*";
- if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
- $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
- push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
- } elsif ($t[EXPORT_POS] < 1) {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- } else {
- $s->[SAM_RNAME] = $t[EXPORT_CHROM];
- $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
- $has_coor = 1;
- }
- $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
-
-# print STDERR "t[14] = " . $t[14] . "\n";
- my $matchDesc = '';
- $s->[SAM_CIGAR] = "*";
- if($has_coor){
- $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
-
- if($matchDesc =~ /\^/){
- # construct CIGAR string using Richard's function
- $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
- } else {
- $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
- }
- }
-
-# print STDERR "cigar_string = $cigar_string\n";
-
- $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
- if($has_coor){
- my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
- my $pemap = 0;
- if($is_paired) {
- $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
-
- # set `proper pair' bit if non-blank, non-zero PE alignment score:
- $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
- }
- $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
- } else {
- $s->[SAM_MAPQ] = 0;
- }
- # mate coordinate
- $s->[SAM_MRNM] = '*';
- $s->[SAM_MPOS] = 0;
- $s->[SAM_ISIZE] = 0;
- # aux
- push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
- if($has_coor){
- # The export match descriptor differs slightly from the samtools match descriptor.
- # In order for the converted SAM files to be as compliant as possible,
- # we put the export match descriptor in optional field 'XD' rather than 'MD':
- push(@$s, "XD:Z:$matchDesc");
- push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
- push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
- }
-}
-
-
-
-#
-# the following code is taken from Richard Shaw's sorted2sam.pl file
-#
-sub reverse_compl_match_descriptor($)
-{
-# print "\nREVERSING THE MATCH DESCRIPTOR!\n";
- my ($match_desc) = @_;
- my $rev_compl_match_desc = reverse($match_desc);
- $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
-
- # Unreverse the digits of numbers.
- $rev_compl_match_desc = join('',
- map {($_ =~ /\d+/)
- ? join('', reverse(split('', $_)))
- : $_} split(/(\d+)/,
- $rev_compl_match_desc));
-
- return $rev_compl_match_desc;
-}
-
-
-
-sub match_desc_to_cigar($)
-{
- my ($match_desc) = @_;
-
- my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
- my $cigar_str = '';
- my $cigar_del_ch = 'D';
- my $cigar_ins_ch = 'I';
- my $cigar_match_ch = 'M';
-
- foreach my $match_desc_part (@match_desc_parts) {
- next if (!$match_desc_part);
-
- if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
- # Deletion
- $cigar_str .= (length($1) . $cigar_del_ch);
- } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
- # Insertion
- $cigar_str .= ($1 . $cigar_ins_ch);
- } else {
- $cigar_str .= (match_desc_frag_length($match_desc_part)
- . $cigar_match_ch);
- }
- }
-
- return $cigar_str;
-}
-
-
-#------------------------------------------------------------------------------
-
-sub match_desc_frag_length($)
- {
- my ($match_desc_str) = @_;
- my $len = 0;
-
- my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
-
- foreach my $match_desc_field (@match_desc_fields) {
- next if ($match_desc_field eq '');
-
- $len += (($match_desc_field =~ /(\d+)/)
- ? $1 : length($match_desc_field));
- }
-
- return $len;
-}
-
-
-# argument holds the command line
-sub write_header($;$;$)
-{
- my ($progname,$version,$cl) = @_;
- my $complete_header = "";
- $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
-
- return $complete_header;
-}
+#!/usr/bin/env perl\r
+#\r
+#\r
+# export2sam.pl converts GERALD export files to SAM format.\r
+#\r
+#\r
+#\r
+########## License:\r
+#\r
+# The MIT License\r
+#\r
+# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.\r
+# Modified SAMtools work copyright (c) 2010 Illumina, Inc.\r
+#\r
+# Permission is hereby granted, free of charge, to any person obtaining a copy\r
+# of this software and associated documentation files (the "Software"), to deal\r
+# in the Software without restriction, including without limitation the rights\r
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
+# copies of the Software, and to permit persons to whom the Software is\r
+# furnished to do so, subject to the following conditions:\r
+#\r
+# The above copyright notice and this permission notice shall be included in\r
+# all copies or substantial portions of the Software.\r
+# \r
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r
+# THE SOFTWARE.\r
+#\r
+#\r
+#\r
+#\r
+########## ChangeLog:\r
+#\r
+# Version: 2.3.1 (18MAR2011)\r
+#\r
+# - Restore file '-' as stdin input.\r
+#\r
+# Version: 2.3.0 (24JAN2011)\r
+#\r
+# - Add support for export reserved chromosome name "CONTROL",\r
+# which is translated to optional field "XC:Z:CONTROL".\r
+# - Check for ".gz" file extension on export files and open\r
+# these as gzip pipes when the extension is found.\r
+#\r
+# Version: 2.2.0 (16NOV2010)\r
+#\r
+# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y\r
+# - For export records with reserved chromosome name identifiers\r
+# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"\r
+# to the SAM record, so that these cases can be distinguished\r
+# from other unmatched reads.\r
+#\r
+# Version: 2.1.0 (21SEP2010)\r
+#\r
+# - Additional export record error checking.\r
+# - Convert export records with chromomsome value of "RM" to unmapped \r
+# SAM records.\r
+#\r
+# Version: 2.0.0 (15FEB2010)\r
+#\r
+# Script updated by Illumina in conjunction with CASAVA 1.7.0\r
+# release.\r
+#\r
+# Major changes are as follows:\r
+# - The CIGAR string has been updated to include all gaps from\r
+# ELANDv2 alignments.\r
+# - The ELAND single read alignment score is always stored in the\r
+# optional "SM" field and the ELAND paired read alignment score\r
+# is stored in the optional "AS" field when it exists.\r
+# - The MAPQ value is set to the higher of the two alignment scores,\r
+# but no greater than 254, i.e. min(254,max(SM,AS))\r
+# - The SAM "proper pair" bit (0x0002) is now set for read pairs\r
+# meeting ELAND's expected orientation and insert size criteria.\r
+# - The default quality score translation is set for export files\r
+# which contain Phread+64 quality values. An option,\r
+# "--qlogodds", has been added to translate quality values from\r
+# the Solexa+64 format used in export files prior to Pipeline\r
+# 1.3\r
+# - The export match descriptor is now reverse-complemented when\r
+# necessary such that it always corresponds to the forward\r
+# strand of the reference, to be consistent with other\r
+# information in the SAM record. It is now written to the\r
+# optional 'XD' field (rather than 'MD') to acknowledge its\r
+# minor differences from the samtools match descriptor (see\r
+# additional detail below).\r
+# - An option, "--nofilter", has been added to include reads which\r
+# have failed primary analysis quality filtration. Such reads\r
+# will have the corresponding SAM flag bit (0x0200) set.\r
+# - Labels in the export 'contig' field are preserved by setting\r
+# RNAME to "$export_chromosome/$export_contig" when the contig\r
+# label exists.\r
+#\r
+#\r
+# Contact: lh3\r
+# Version: 0.1.2 (03JAN2009)\r
+#\r
+#\r
+#\r
+########## Known Conversion Limitations:\r
+#\r
+# - Export records for reads that map to a position < 1 (allowed\r
+# in export format), are converted to unmapped reads in the SAM\r
+# record.\r
+# - Export records contain the reserved chromosome names: "NM",\r
+# "QC","RM" and "CONTROL". "NM" indicates that the aligner could\r
+# not map the read to the reference sequence set. "QC" means that\r
+# the aligner did not attempt to map the read due to some\r
+# technical limitation. "RM" means that the read mapped to a set\r
+# of 'contaminant' sequences specified in GERALD's RNA-seq\r
+# workflow. "CONTROL" means that the read is a control. All of\r
+# these alignment types are collapsed to the single unmapped\r
+# alignment state in the SAM record, but the optional SAM "XC"\r
+# field is used to record the original reserved chromosome name of\r
+# the read for all but the "NM" case.\r
+# - The export match descriptor is slightly different than the\r
+# samtools match descriptor. For this reason it is stored in the\r
+# optional SAM field 'XD' (and not 'MD'). Note that the export\r
+# match descriptor differs from the samtools version in two\r
+# respects: (1) indels are explicitly closed with the '$'\r
+# character and (2) insertions must be enumerated in the match\r
+# descriptor. For example a 35-base read with a two-base insertion\r
+# is described as: 20^2$14\r
+#\r
+#\r
+#\r
+\r
+my $version = "2.3.1";\r
+\r
+use strict;\r
+use warnings;\r
+\r
+use Getopt::Long;\r
+use File::Spec;\r
+use List::Util qw(min max);\r
+\r
+\r
+use constant {\r
+ EXPORT_MACHINE => 0,\r
+ EXPORT_RUNNO => 1,\r
+ EXPORT_LANE => 2,\r
+ EXPORT_TILE => 3,\r
+ EXPORT_X => 4,\r
+ EXPORT_Y => 5,\r
+ EXPORT_INDEX => 6,\r
+ EXPORT_READNO => 7,\r
+ EXPORT_READ => 8,\r
+ EXPORT_QUAL => 9,\r
+ EXPORT_CHROM => 10,\r
+ EXPORT_CONTIG => 11,\r
+ EXPORT_POS => 12,\r
+ EXPORT_STRAND => 13, \r
+ EXPORT_MD => 14,\r
+ EXPORT_SEMAP => 15,\r
+ EXPORT_PEMAP => 16,\r
+ EXPORT_PASSFILT => 21,\r
+ EXPORT_SIZE => 22,\r
+};\r
+\r
+\r
+use constant {\r
+ SAM_QNAME => 0,\r
+ SAM_FLAG => 1,\r
+ SAM_RNAME => 2,\r
+ SAM_POS => 3,\r
+ SAM_MAPQ => 4,\r
+ SAM_CIGAR => 5,\r
+ SAM_MRNM => 6,\r
+ SAM_MPOS => 7,\r
+ SAM_ISIZE => 8,\r
+ SAM_SEQ => 9,\r
+ SAM_QUAL => 10,\r
+};\r
+\r
+\r
+# function prototypes for Richard's code\r
+sub match_desc_to_cigar($);\r
+sub match_desc_frag_length($);\r
+sub reverse_compl_match_descriptor($);\r
+sub write_header($;$;$);\r
+\r
+\r
+&export2sam;\r
+exit;\r
+\r
+\r
+\r
+\r
+sub export2sam {\r
+\r
+ my $cmdline = $0 . " " . join(" ",@ARGV);\r
+ my $arg_count = scalar @ARGV;\r
+ my $progname = (File::Spec->splitpath($0))[2];\r
+\r
+ my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values\r
+ my $is_nofilter = 0;\r
+ my $read1file;\r
+ my $read2file;\r
+ my $print_version = 0;\r
+ my $help = 0;\r
+\r
+ my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, \r
+ "nofilter" => \$is_nofilter,\r
+ "read1=s" => \$read1file,\r
+ "read2=s" => \$read2file,\r
+ "version" => \$print_version,\r
+ "help" => \$help );\r
+\r
+ my $usage = <<END;\r
+\r
+$progname converts GERALD export files to SAM format.\r
+\r
+Usage: $progname --read1=FILENAME [ options ] | --version | --help\r
+\r
+ --read1=FILENAME read1 export file or '-' for stdin (mandatory)\r
+ (file may be gzipped with ".gz" extension)\r
+ --read2=FILENAME read2 export file or '-' for stdin\r
+ (file may be gzipped with ".gz" extension)\r
+ --nofilter include reads that failed the basecaller\r
+ purity filter\r
+ --qlogodds assume export file(s) use logodds quality values\r
+ as reported by OLB (Pipeline) prior to v1.3\r
+ (default: phred quality values)\r
+\r
+END\r
+\r
+ my $version_msg = <<END;\r
+\r
+$progname version: $version\r
+\r
+END\r
+\r
+ if((not $result) or $help or ($arg_count==0)) {\r
+ die($usage);\r
+ }\r
+\r
+ if(@ARGV) {\r
+ print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";\r
+ die($usage);\r
+ }\r
+\r
+ if($print_version) {\r
+ die($version_msg);\r
+ }\r
+\r
+ if(not defined($read1file)) {\r
+ print STDERR "\nERROR: read1 export file must be specified\n\n";\r
+ die($usage);\r
+ }\r
+\r
+ unless((-f $read1file) or ($read1file eq '-')) {\r
+ die("\nERROR: Can't find read1 export file: '$read1file'\n\n");\r
+ }\r
+\r
+ if (defined $read2file) {\r
+ unless((-f $read2file) or ($read2file eq '-')) {\r
+ die("\nERROR: Can't find read2 export file: '$read2file'\n\n");\r
+ }\r
+ if($read1file eq $read2file) {\r
+ die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");\r
+ }\r
+ }\r
+\r
+ my ($fh1, $fh2, $is_paired);\r
+\r
+ my $read1cmd="$read1file";\r
+ $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);\r
+ open($fh1, $read1cmd)\r
+ or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");\r
+ $is_paired = defined $read2file;\r
+ if ($is_paired) {\r
+ my $read2cmd="$read2file";\r
+ $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);\r
+ open($fh2, $read2cmd)\r
+ or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");\r
+ }\r
+ # quality value conversion table\r
+ my @conv_table;\r
+ if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):\r
+ for (-64..64) {\r
+ $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);\r
+ }\r
+ } else { # convert from phred+64 quality values (pipeline v1.3+):\r
+ for (-64..-1) {\r
+ $conv_table[$_+64] = undef;\r
+ }\r
+ for (0..64) {\r
+ $conv_table[$_+64] = int(33 + $_);\r
+ }\r
+ }\r
+ # write the header\r
+ print write_header( $progname, $version, $cmdline );\r
+ # core loop\r
+ my $export_line_count = 0;\r
+ while (<$fh1>) {\r
+ $export_line_count++;\r
+ my (@s1, @s2);\r
+ &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);\r
+ if ($is_paired) {\r
+ my $read2line = <$fh2>;\r
+ if(not $read2line){\r
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");\r
+ }\r
+ &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);\r
+\r
+ if (@s1 && @s2) { # then set mate coordinate\r
+ if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){\r
+ die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");\r
+ }\r
+\r
+ my $isize = 0;\r
+ if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize\r
+ my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];\r
+ my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];\r
+ $isize = $x2 - $x1;\r
+ }\r
+\r
+ foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ \r
+ my ($sa,$sb,$is) = @{$_};\r
+ if ($sb->[SAM_RNAME] ne '*') {\r
+ $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];\r
+ $sa->[SAM_MPOS] = $sb->[SAM_POS];\r
+ $sa->[SAM_ISIZE] = $is;\r
+ $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);\r
+ } else {\r
+ $sa->[SAM_FLAG] |= 0x8;\r
+ }\r
+ } \r
+ }\r
+ }\r
+ print join("\t", @s1), "\n" if (@s1);\r
+ print join("\t", @s2), "\n" if (@s2 && $is_paired);\r
+ }\r
+ close($fh1);\r
+ if($is_paired) {\r
+ while(my $read2line = <$fh2>){\r
+ $export_line_count++;\r
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");\r
+ }\r
+ close($fh2);\r
+ }\r
+}\r
+\r
+sub export2sam_aux {\r
+ my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;\r
+ chomp($line);\r
+ my @t = split("\t", $line);\r
+ if(scalar(@t) < EXPORT_SIZE) {\r
+ my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";\r
+ $msg.="\t...erroneous export record:\n" . $line . "\n\n";\r
+ die($msg);\r
+ }\r
+ @$s = ();\r
+ my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');\r
+ return if(not ($isPassFilt or $is_nofilter));\r
+ # read name\r
+ my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");\r
+ $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),\r
+ int($t[EXPORT_X]), int($t[EXPORT_Y]));\r
+ # initial flag (will be updated later)\r
+ $s->[SAM_FLAG] = 0;\r
+ if($is_paired) {\r
+ if($t[EXPORT_READNO] != $read_no){\r
+ die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");\r
+ }\r
+ $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);\r
+ }\r
+ $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);\r
+\r
+ # read & quality\r
+ my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');\r
+ if ($is_export_rev) { # then reverse the sequence and quality\r
+ $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);\r
+ $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;\r
+ $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);\r
+ } else {\r
+ $s->[SAM_SEQ] = $t[EXPORT_READ];\r
+ $s->[SAM_QUAL] = $t[EXPORT_QUAL];\r
+ }\r
+ my @convqual = ();\r
+ foreach (unpack('C*', $s->[SAM_QUAL])){\r
+ my $val=$ct->[$_];\r
+ if(not defined $val){\r
+ my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";\r
+ if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }\r
+ die($msg . "\n");\r
+ }\r
+ push @convqual,$val;\r
+ }\r
+\r
+ $s->[SAM_QUAL] = pack('C*',@convqual); # change coding\r
+\r
+\r
+ # coor\r
+ my $has_coor = 0;\r
+ $s->[SAM_RNAME] = "*";\r
+ if (($t[EXPORT_CHROM] eq 'NM') or\r
+ ($t[EXPORT_CHROM] eq 'QC') or\r
+ ($t[EXPORT_CHROM] eq 'RM') or\r
+ ($t[EXPORT_CHROM] eq 'CONTROL')) {\r
+ $s->[SAM_FLAG] |= 0x4; # unmapped\r
+ push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');\r
+ } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {\r
+ $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?\r
+ push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")\r
+ } elsif ($t[EXPORT_POS] < 1) {\r
+ $s->[SAM_FLAG] |= 0x4; # unmapped\r
+ } else {\r
+ $s->[SAM_RNAME] = $t[EXPORT_CHROM];\r
+ $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');\r
+ $has_coor = 1;\r
+ }\r
+ $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;\r
+\r
+# print STDERR "t[14] = " . $t[14] . "\n";\r
+ my $matchDesc = '';\r
+ $s->[SAM_CIGAR] = "*";\r
+ if($has_coor){\r
+ $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];\r
+\r
+ if($matchDesc =~ /\^/){\r
+ # construct CIGAR string using Richard's function\r
+ $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing\r
+ } else {\r
+ $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";\r
+ }\r
+ }\r
+\r
+# print STDERR "cigar_string = $cigar_string\n";\r
+\r
+ $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);\r
+ if($has_coor){\r
+ my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;\r
+ my $pemap = 0;\r
+ if($is_paired) {\r
+ $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;\r
+\r
+ # set `proper pair' bit if non-blank, non-zero PE alignment score:\r
+ $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);\r
+ }\r
+ $s->[SAM_MAPQ] = min(254,max($semap,$pemap));\r
+ } else {\r
+ $s->[SAM_MAPQ] = 0;\r
+ }\r
+ # mate coordinate\r
+ $s->[SAM_MRNM] = '*';\r
+ $s->[SAM_MPOS] = 0;\r
+ $s->[SAM_ISIZE] = 0;\r
+ # aux\r
+ push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);\r
+ if($has_coor){\r
+ # The export match descriptor differs slightly from the samtools match descriptor.\r
+ # In order for the converted SAM files to be as compliant as possible,\r
+ # we put the export match descriptor in optional field 'XD' rather than 'MD':\r
+ push(@$s, "XD:Z:$matchDesc");\r
+ push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');\r
+ push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));\r
+ }\r
+}\r
+\r
+\r
+\r
+#\r
+# the following code is taken from Richard Shaw's sorted2sam.pl file\r
+#\r
+sub reverse_compl_match_descriptor($)\r
+{\r
+# print "\nREVERSING THE MATCH DESCRIPTOR!\n";\r
+ my ($match_desc) = @_;\r
+ my $rev_compl_match_desc = reverse($match_desc);\r
+ $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;\r
+\r
+ # Unreverse the digits of numbers.\r
+ $rev_compl_match_desc = join('',\r
+ map {($_ =~ /\d+/)\r
+ ? join('', reverse(split('', $_)))\r
+ : $_} split(/(\d+)/,\r
+ $rev_compl_match_desc));\r
+\r
+ return $rev_compl_match_desc;\r
+}\r
+\r
+\r
+\r
+sub match_desc_to_cigar($)\r
+{\r
+ my ($match_desc) = @_;\r
+\r
+ my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);\r
+ my $cigar_str = '';\r
+ my $cigar_del_ch = 'D';\r
+ my $cigar_ins_ch = 'I';\r
+ my $cigar_match_ch = 'M';\r
+\r
+ foreach my $match_desc_part (@match_desc_parts) {\r
+ next if (!$match_desc_part);\r
+\r
+ if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {\r
+ # Deletion\r
+ $cigar_str .= (length($1) . $cigar_del_ch);\r
+ } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {\r
+ # Insertion\r
+ $cigar_str .= ($1 . $cigar_ins_ch);\r
+ } else {\r
+ $cigar_str .= (match_desc_frag_length($match_desc_part)\r
+ . $cigar_match_ch);\r
+ }\r
+ }\r
+\r
+ return $cigar_str;\r
+}\r
+\r
+\r
+#------------------------------------------------------------------------------\r
+\r
+sub match_desc_frag_length($)\r
+ {\r
+ my ($match_desc_str) = @_;\r
+ my $len = 0;\r
+\r
+ my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);\r
+\r
+ foreach my $match_desc_field (@match_desc_fields) {\r
+ next if ($match_desc_field eq '');\r
+\r
+ $len += (($match_desc_field =~ /(\d+)/)\r
+ ? $1 : length($match_desc_field));\r
+ }\r
+\r
+ return $len;\r
+}\r
+\r
+\r
+# argument holds the command line\r
+sub write_header($;$;$) \r
+{\r
+ my ($progname,$version,$cl) = @_;\r
+ my $complete_header = "";\r
+ $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";\r
+\r
+ return $complete_header;\r
+}\r