4 # Script to convert GERALD export files to SAM format.
12 # Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd.
13 # Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc.
15 # Permission is hereby granted, free of charge, to any person obtaining a copy
16 # of this software and associated documentation files (the "Software"), to deal
17 # in the Software without restriction, including without limitation the rights
18 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 # copies of the Software, and to permit persons to whom the Software is
20 # furnished to do so, subject to the following conditions:
22 # The above copyright notice and this permission notice shall be included in
23 # all copies or substantial portions of the Software.
25 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
37 # Version: 2.0.0 (15FEB2010)
38 # Script updated by Illumina in conjunction with CASAVA 1.7.0 release.
39 # Major changes are as follows:
40 # - The CIGAR string has been updated to include all gaps from ELANDv2 alignments.
41 # - The ELAND single read alignment score is always stored in the optional "SM" field
42 # and the ELAND paired read alignment score is stored in the optional "AS" field
44 # - The MAPQ value is set to the higher of the two alignment scores, but no greater
45 # than 254, i.e. min(254,max(SM,AS))
46 # - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's
47 # expected orientation and insert size criteria.
48 # - The default quality score translation is set for export files which contain
49 # Phread+64 quality values. An option, "--qlogodds", has been added to
50 # translate quality values from the Solexa+64 format used in export files prior
52 # - The export match descriptor is now reverse-complemented when necessary such that
53 # it always corresponds to the forward strand of the reference, to be consistent
54 # with other information in the SAM record. It is now written to the optional
55 # 'XD' field (rather than 'MD') to acknowledge its minor differences from the
56 # samtools match descriptor (see additional detail below).
57 # - An option, "--nofilter", has been added to include reads which have failed
58 # primary analysis quality filtration. Such reads will have the corresponding
59 # SAM flag bit (0x0200) set.
60 # - Labels in the export 'contig' field are preserved by setting RNAME to
61 # "$export_chromosome/$export_contig" when then contig label exists.
65 # Version: 0.1.2 (03JAN2009)
69 ########## Known Conversion Limitations:
71 # - Export records for reads that map to a position < 1 (allowed in export format), are converted
72 # to unmapped reads in the SAM record.
73 # - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the
74 # aligner could not map the read to the reference sequence set, and "QC" means that the
75 # aligner did not attempt to map the read due to some technical limitation. Both of these
76 # alignment types are collapsed to the single unmapped alignment state in the SAM record.
77 # - The export match descriptor is slightly different than the samtools match descriptor. For
78 # this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the
79 # export match descriptor differs from the samtools version in two respects: (1) indels
80 # are explicitly closed with the '$' character and (2) insertions must be enumerated in
81 # the match descriptor. For example a 35-base read with a two-base insertion is described
87 my $version = "2.0.0";
92 use File::Spec qw(splitpath);
94 use List::Util qw(min max);
109 EXPORT_PASSFILT => 21,
128 # function prototypes for Richard's code
129 sub match_desc_to_cigar($);
130 sub match_desc_frag_length($);
131 sub reverse_compl_match_descriptor($);
132 sub write_header($;$;$);
143 my $cmdline = $0 . " " . join(" ",@ARGV);
144 my $arg_count = scalar @ARGV;
145 my @spval = File::Spec->splitpath($0);
146 my $progname = $spval[2];
148 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
152 my $print_version = 0;
155 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
156 "nofilter" => \$is_nofilter,
157 "read1=s" => \$read1file,
158 "read2=s" => \$read2file,
159 "version" => \$print_version,
164 $progname converts GERALD export files to SAM format.
166 Usage: $progname --read1=FILENAME [ options ] | --version | --help
168 --read1=FILENAME read1 export file (mandatory)
169 --read2=FILENAME read2 export file
170 --nofilter include reads that failed the pipeline/RTA purity filter
171 --qlogodds assume export file(s) use logodds quality values as reported
172 by pipeline prior to v1.3 (default: phred quality values)
176 my $version_msg = <<END;
178 $progname version: $version
182 if((not $result) or $help or ($arg_count==0)) {
187 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
195 if(not defined($read1file)) {
196 print STDERR "\nERROR: read1 export file must be specified\n\n";
200 my ($fh1, $fh2, $is_paired);
202 open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n");
203 $is_paired = defined $read2file;
205 open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n");
207 # quality value conversion table
209 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
211 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
213 } else { # convert from phred+64 quality values (pipeline v1.3+):
215 $conv_table[$_+64] = undef;
218 $conv_table[$_+64] = int(33 + $_);
222 print write_header( $progname, $version, $cmdline );
224 my $export_line_count = 0;
226 $export_line_count++;
228 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
230 my $read2line = <$fh2>;
232 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
234 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
236 if (@s1 && @s2) { # then set mate coordinate
237 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
238 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
242 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
243 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
244 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
248 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
249 my ($sa,$sb,$is) = @{$_};
250 if ($sb->[SAM_RNAME] ne '*') {
251 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
252 $sa->[SAM_MPOS] = $sb->[SAM_POS];
253 $sa->[SAM_ISIZE] = $is;
254 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
256 $sa->[SAM_FLAG] |= 0x8;
261 print join("\t", @s1), "\n" if (@s1);
262 print join("\t", @s2), "\n" if (@s2 && $is_paired);
266 while(my $read2line = <$fh2>){
267 $export_line_count++;
268 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
275 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
277 my @t = split("\t", $line);
279 my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
280 return if(not ($isPassFilt or $is_nofilter));
282 $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
283 # initial flag (will be updated later)
286 if($t[EXPORT_READNO] != $read_no){
287 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
289 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
291 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
294 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
295 if ($is_export_rev) { # then reverse the sequence and quality
296 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
297 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
298 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
300 $s->[SAM_SEQ] = $t[EXPORT_READ];
301 $s->[SAM_QUAL] = $t[EXPORT_QUAL];
304 foreach (unpack('C*', $s->[SAM_QUAL])){
306 if(not defined $val){
307 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
308 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
314 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
319 $s->[SAM_RNAME] = "*";
320 if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') {
321 $s->[SAM_FLAG] |= 0x4; # unmapped
322 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
323 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
324 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
325 } elsif ($t[EXPORT_POS] < 1) {
326 $s->[SAM_FLAG] |= 0x4; # unmapped
328 $s->[SAM_RNAME] = $t[EXPORT_CHROM];
329 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
332 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
334 # print STDERR "t[14] = " . $t[14] . "\n";
336 $s->[SAM_CIGAR] = "*";
338 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
340 if($matchDesc =~ /\^/){
341 # construct CIGAR string using Richard's function
342 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
344 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
348 # print STDERR "cigar_string = $cigar_string\n";
350 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
352 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
355 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
357 # set `proper pair' bit if non-blank, non-zero PE alignment score:
358 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
360 $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
365 $s->[SAM_MRNM] = '*';
369 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
371 # The export match descriptor differs slightly from the samtools match descriptor.
372 # In order for the converted SAM files to be as compliant as possible,
373 # we put the export match descriptor in optional field 'XD' rather than 'MD':
374 push(@$s, "XD:Z:$matchDesc");
375 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
376 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
383 # the following code is taken from Richard Shaw's sorted2sam.pl file
385 sub reverse_compl_match_descriptor($)
387 # print "\nREVERSING THE MATCH DESCRIPTOR!\n";
388 my ($match_desc) = @_;
389 my $rev_compl_match_desc = reverse($match_desc);
390 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
392 # Unreverse the digits of numbers.
393 $rev_compl_match_desc = join('',
395 ? join('', reverse(split('', $_)))
397 $rev_compl_match_desc));
399 return $rev_compl_match_desc;
404 sub match_desc_to_cigar($)
406 my ($match_desc) = @_;
408 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
410 my $cigar_del_ch = 'D';
411 my $cigar_ins_ch = 'I';
412 my $cigar_match_ch = 'M';
414 foreach my $match_desc_part (@match_desc_parts) {
415 next if (!$match_desc_part);
417 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
419 $cigar_str .= (length($1) . $cigar_del_ch);
420 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
422 $cigar_str .= ($1 . $cigar_ins_ch);
424 $cigar_str .= (match_desc_frag_length($match_desc_part)
433 #------------------------------------------------------------------------------
435 sub match_desc_frag_length($)
437 my ($match_desc_str) = @_;
440 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
442 foreach my $match_desc_field (@match_desc_fields) {
443 next if ($match_desc_field eq '');
445 $len += (($match_desc_field =~ /(\d+)/)
446 ? $1 : length($match_desc_field));
453 # argument holds the command line
454 sub write_header($;$;$)
456 my ($progname,$version,$cl) = @_;
457 my $complete_header = "";
458 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
460 return $complete_header;