1 package Maasha::TwoBit;
3 # Copyright (C) 2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Stuff for interacting with the 2bit format as described here:
26 # http://genome.ucsc.edu/FAQ/FAQformat#format7
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use vars qw( @ISA @EXPORT );
38 use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } );
40 int find_block_beg( char *string, char c, int beg, int len )
42 /* Martin A. Hansen, March 2008 */
44 /* Given a string and a begin position, locates the next */
45 /* position in the string MATCHING a given char. */
46 /* This position is returned. If the char is not found -1 is returned. */
50 for ( i = beg; i < len; i++ )
52 if ( string[ i ] == c ) {
61 int find_block_len( char *string, char c, int beg, int len )
63 /* Martin A. Hansen, March 2008 */
65 /* Given a string and a begin position, locates the next length of */
66 /* a block consisting of a given char. The length of that block is returned. */
72 while ( i < len && string[ i ] == c )
81 char l2n[26] = { 2, 255, 1, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255 };
83 void dna2bin( char *raw, int size )
85 /* Khisanth from #perl March 2008 */
87 /* Encodes a DNA string to a bit array */
91 unsigned char packed_value = 0;
92 char *packed = malloc( size / 4 );
95 for( i = 0; i < size / 4; i++ ) {
96 packed_value = l2n[ raw[i*4] - 'A' ] << 6
97 | l2n[ raw[i*4+1] - 'A' ] << 4
98 | l2n[ raw[i*4+2] - 'A' ] << 2
99 | l2n[ raw[i*4+3] - 'A' ];
100 packed[i] = packed_value;
104 Inline_Stack_Push(sv_2mortal(newSVpvn(packed, size / 4 )));
111 "TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT",
112 "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC",
113 "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA",
114 "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG",
115 "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT",
116 "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC",
117 "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA",
118 "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG",
119 "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT",
120 "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC",
121 "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA",
122 "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG",
123 "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT",
124 "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC",
125 "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA",
126 "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG",
127 "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT",
128 "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC",
129 "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA",
130 "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG",
131 "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT",
132 "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC",
133 "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA",
134 "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG",
135 "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT",
136 "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC",
137 "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA",
138 "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG",
139 "GGGT", "GGGC", "GGGA", "GGGG"
143 void bin2dna( char *raw, int size )
145 /* Khisanth from #perl, March 2008 */
147 /* Converts a bit array to DNA which is returned. */
150 char *unpacked = malloc( 4 * size + 1 );
153 unsigned char conv_index;
156 for( i = 0; i < size; i++ ) {
157 memset( &conv_index, raw[i], 1 );
158 memcpy( unpacked + i*4, conv[conv_index], 4);
162 Inline_Stack_Push(sv_2mortal(newSVpvn(unpacked, 4 * size)));
168 void bin2dna_old( char *bin, int bin_len )
170 /* Martin A. Hansen, March 2008 */
172 /* Converts a binary string to DNA which is returned. */
178 char *dna = ( char* )( malloc( bin_len / 2 ) );
182 for ( i = 1; i < bin_len; i += 2 )
184 if ( bin[ i - 1 ] == '1' )
186 if ( bin[ i ] == '1' ) {
194 if ( bin[ i ] == '1' ) {
205 Inline_Stack_Push( sv_2mortal( newSVpvn( dna, ( bin_len / 2 ) ) ) );
212 void hard_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
214 /* Martin A. Hansen, March 2008 */
216 /* Hard masks a sequnce in a given interval, which is trimmed, */
217 /* if it does not match the sequence. */
219 int i, mask_beg, mask_len;
221 if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
223 mask_beg = beg - sub_beg;
225 if ( mask_beg < 0 ) {
231 if ( sub_len < mask_len ) {
235 for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
242 void soft_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
244 /* Martin A. Hansen, March 2008 */
246 /* Soft masks a sequnce in a given interval, which is trimmed, */
247 /* if it does not match the sequence. */
249 int i, mask_beg, mask_len;
251 if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
253 mask_beg = beg - sub_beg;
255 if ( mask_beg < 0 ) {
261 if ( sub_len < mask_len ) {
265 for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
266 seq[ i ] = seq[ i ] ^ ' ';
283 @ISA = qw( Exporter );
286 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
291 # Martin A. Hansen, March 2008.
293 # Fetches the table of contents (TOC) from a 2bit file.
294 # The TOC is returned as a list of lists.
296 # The 2bit format is described here:
297 # http://genome.ucsc.edu/FAQ/FAQformat#format7
299 my ( $fh, # filehandle
304 my ( $signature, $version, $seq_count, $reserved, $i, $seq_name_size, $string, $seq_name, $offset, @AoA );
308 $signature = unpack_32bit( $fh );
309 $version = unpack_32bit( $fh );
310 $seq_count = unpack_32bit( $fh );
311 $reserved = unpack_32bit( $fh );
313 Maasha::Common::error( qq(2bit file signature didn't match - inverse bit order?) ) if $signature != 0x1A412743;
315 for ( $i = 0; $i < $seq_count; $i++ )
317 $seq_name_size = unpack_8bit( $fh );
319 sysread $fh, $string, $seq_name_size;
321 $seq_name = unpack( "A$seq_name_size", $string );
323 $offset = unpack_32bit( $fh );
325 push @AoA, [ $seq_name, $offset ];
328 return wantarray ? @AoA : \@AoA;
334 # Martin A. Hansen, March 2008.
336 # Given a filehandle to a 2bit file, gets the sequence
337 # or subsequence from an 2bit file entry at the given
340 # The 2bit format is described here:
341 # http://genome.ucsc.edu/FAQ/FAQformat#format7
343 my ( $fh, # filehandle
344 $offset, # byte position
345 $sub_beg, # begin of subsequence - OPTIONAL
346 $sub_len, # length of subsequence - OPTIONAL
347 $mask, # retrieve soft mask information flag - OPTIONAL
352 my ( $string, $seq_len, $n_count, $n, @n_begs, @n_sizes, $m_count, $m, @m_begs, @m_sizes, $reserved, $seq );
355 $sub_len ||= 9999999999;
357 sysseek $fh, $offset, 0;
359 $seq_len = unpack_32bit( $fh );
360 $sub_len = $seq_len if $sub_len > $seq_len;
362 $n_count = unpack_32bit( $fh );
364 map { push @n_begs, unpack_32bit( $fh ) } 1 .. $n_count;
365 map { push @n_sizes, unpack_32bit( $fh ) } 1 .. $n_count;
367 $m_count = unpack_32bit( $fh );
369 map { push @m_begs, unpack_32bit( $fh ) } 1 .. $m_count;
370 map { push @m_sizes, unpack_32bit( $fh ) } 1 .. $m_count;
372 $reserved = unpack_32bit( $fh );
374 $offset += 4 + 4 + $n_count * 8 + 4 + $m_count * 8 + 4;
376 $seq = unpack_dna( $fh, $offset, $sub_beg, $sub_len );
378 for ( $n = 0; $n < $n_count; $n++ )
380 hard_mask( $seq, $n_begs[ $n ], $n_sizes[ $n ], $sub_beg, $sub_len );
382 last if $sub_beg + $sub_len < $n_begs[ $n ];
387 for ( $m = 0; $m < $m_count; $m++ )
389 soft_mask( $seq, $m_begs[ $m ], $m_sizes[ $m ], $sub_beg, $sub_len );
391 last if $sub_beg + $sub_len < $m_begs[ $m ];
401 # Martin A. Hansen, March 2008.
403 # Reads in 8 bits from the given filehandle
404 # and returns the encoded value.
406 # NB swap still needs fixing.
408 my ( $fh, # filehandle
409 $swap, # bit order swap flag - OPTIONAL
414 my ( $string, $val );
416 sysread $fh, $string, 1;
418 $val = unpack( "C", $string );
426 # Martin A. Hansen, March 2008.
428 # Reads in 32 bits from the given filehandle
429 # and returns the encoded value.
431 my ( $fh, # filehandle
432 $swap, # bit order swap flag - OPTIONAL
437 my ( $string, $val );
439 sysread $fh, $string, 4;
442 $val = unpack( "N", $string );
444 $val = unpack( "V", $string );
453 # Martin A. Hansen, March 2008.
455 # Unpacks the DNA beginning at the given filehandle.
456 # The DNA packed to two bits per base, where the first
457 # base is in the most significant 2-bit byte; the last
458 # base is in the least significant 2 bits. The packed
459 # DNA field is padded with 0 bits as necessary to take
460 # an even multiple of 32 bits in the file.
462 # NB swap still needs fixing.
464 my ( $fh, # filehandle
465 $offset, # file offset
467 $len, # sequence length
468 $swap, # bit order swap flag - OPTIONAL
473 my ( $bin, $bin_beg, $bin_len, $dna, $bin_diff, $len_diff );
475 $bin_beg = int( $beg / 4 );
476 $bin_beg-- if $beg % 4;
477 $bin_beg = 0 if $bin_beg < 0;
479 $bin_len = int( $len / 4 );
480 $bin_len++ if $len % 4;
482 sysseek $fh, $offset + $bin_beg, 0;
483 sysread $fh, $bin, $bin_len;
485 $dna = bin2dna( $bin, $bin_len );
487 $bin_diff = $beg - $bin_beg * 4;
488 $len_diff = $bin_len * 4 - $len;
490 $dna =~ s/^.{$bin_diff}// if $bin_diff;
491 $dna =~ s/.{$len_diff}$// if $len_diff;
499 # Martin A. Hansen, March 2008.
501 # Converts a FASTA file to 2bit format.
503 my ( $fh_in, # file handle to FASTA file
504 $fh_out, # output file handle - OPTIONAL
505 $mask, # preserver soft masking - OPTIONAL
508 my ( $seq_offset, $offset, $entry, $mask_index, $seq_len, $seq_name_len, $pack_len, $rec_len, $index, $bin, $seq );
510 $fh_out = \*STDOUT if not $fh_out;
512 # ---- Creating content index ----
514 $seq_offset = 0; # offset for reading sequence from FASTA file
515 $offset = 16; # offset starting after header line which is 16 bytes
517 while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
519 $seq_len = length $entry->[ SEQ ];
520 $seq_name_len = length $entry->[ SEQ_NAME ];
522 $mask_index = mask_locate( $entry->[ SEQ ], $mask );
524 $pack_len = ( $seq_len + ( 4 - ( $seq_len ) % 4 ) ) / 4;
529 + 4 * $mask_index->{ "N_COUNT" } # N begins
530 + 4 * $mask_index->{ "N_COUNT" } # N lengths
532 + 4 * $mask_index->{ "M_COUNT" } # M begins
533 + 4 * $mask_index->{ "M_COUNT" } # M lengths
535 + $pack_len # Packed DNA - 32 bit multiplum of 2 bit/base sequence in bytes
539 SEQ_NAME => $entry->[ SEQ_NAME ],
540 SEQ_NAME_LEN => $seq_name_len,
541 SEQ_BEG => $seq_offset + $seq_name_len + 2,
543 N_COUNT => $mask_index->{ "N_COUNT" },
544 N_BEGS => $mask_index->{ "N_BEGS" },
545 N_LENS => $mask_index->{ "N_LENS" },
546 M_COUNT => $mask_index->{ "M_COUNT" },
547 M_BEGS => $mask_index->{ "M_BEGS" },
548 M_LENS => $mask_index->{ "M_LENS" },
553 + 1 # 1 byte SEQ_NAME size
554 + $seq_name_len # SEQ_NAME depending on SEQ_NAME size
555 + 4 # 32 bit offset position of sequence record
558 $seq_offset += $seq_name_len + 2 + $seq_len + 1;
561 # ---- Printing Header ----
563 $bin = pack( "V4", oct "0x1A412743", "0", scalar @{ $index }, 0 ); # signature, version, sequence count and reserved
567 # ---- Printing TOC ----
571 foreach $entry ( @{ $index } )
573 $bin .= pack( "C", $entry->{ "SEQ_NAME_LEN" } ); # 1 byte SEQ_NAME size
574 $bin .= pack( qq(A$entry->{ "SEQ_NAME_LEN" }), $entry->{ "SEQ_NAME" } ); # SEQ_NAME depending on SEQ_NAME size
575 $bin .= pack( "V", $offset ); # 32 bit offset position of sequence record
577 $offset += $entry->{ "REC_LEN" };
582 # ---- Printing Records ----
584 foreach $entry ( @{ $index } )
588 $bin .= pack( "V", $entry->{ "SEQ_LEN" } );
589 $bin .= pack( "V", $entry->{ "N_COUNT" } );
591 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_BEGS" } };
592 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_LENS" } };
594 $bin .= pack( "V", $entry->{ "M_COUNT" } );
596 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_BEGS" } };
597 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_LENS" } };
599 $bin .= pack( "V", 0 );
601 sysseek $fh_in, $entry->{ "SEQ_BEG" }, 0;
602 sysread $fh_in, $seq, $entry->{ "SEQ_LEN" };
605 $seq =~ tr/RYWSMKHDVBN/TTTTTTTTTTT/;
607 $bin .= pack_dna( $seq );
619 # Martin A. Hansen, March 2008.
621 # Packs a DNA sequence into a bit array, The DNA packed to two bits per base,
622 # represented as so: T - 00, C - 01, A - 10, G - 11. The first base is
623 # in the most significant 2-bit byte; the last base is in the least significant
624 # 2 bits. For example, the sequence TCAG is represented as 00011011.
625 # The packedDna field is padded with 0 bits as necessary to take an even
626 # multiple of 32 bits in the file.
628 my ( $dna, # dna string to pack
635 $dna .= "T" x ( 4 - ( length( $dna ) % 4 ) );
637 $bin = dna2bin( $dna, length $dna );
645 # Martin A. Hansen, March 2008.
647 # Locate N-blocks and M-blocks in a given sequence.
648 # These blocks a continously streches of Ns and Ms in a string,
649 # and the begins and lenghts of these blocks are saved in a
650 # hash along with the count of each block type.
652 my ( $seq, # Sequence
653 $mask, # preserve soft masking flag - OPTIONAL
658 my ( $n_mask, $m_mask, $seq_len, $pos, $n_beg, $n_len, $m_beg, $m_len, @n_begs, @n_lens, @m_begs, @m_lens, %mask_hash );
660 $seq =~ tr/atcgunRYWSMKHDVBrywsmkhdvb/MMMMMNNNNNNNNNNNNNNNNNNNNN/;
662 $n_mask = 1; # always mask Ns.
663 $m_mask = $mask || 0;
665 $seq_len = length $seq;
669 while ( $n_mask or $m_mask )
673 $n_beg = find_block_beg( $seq, "N", $pos, $seq_len );
675 $n_mask = 0 if $n_beg < 0;
680 $m_beg = find_block_beg( $seq, "M", $pos, $seq_len );
682 $m_mask = 0 if $m_beg < 0;
685 if ( $n_mask and $m_mask )
687 if ( $n_beg < $m_beg )
689 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
691 push @n_begs, $n_beg;
692 push @n_lens, $n_len;
694 $pos = $n_beg + $n_len;
698 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
700 push @m_begs, $m_beg;
701 push @m_lens, $m_len;
703 $pos = $m_beg + $m_len;
708 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
710 push @n_begs, $n_beg;
711 push @n_lens, $n_len;
713 $pos = $n_beg + $n_len;
717 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
719 push @m_begs, $m_beg;
720 push @m_lens, $m_len;
722 $pos = $m_beg + $m_len;
731 N_COUNT => scalar @n_begs,
732 N_BEGS => [ @n_begs ],
733 N_LENS => [ @n_lens ],
734 M_COUNT => scalar @m_begs,
735 M_BEGS => [ @m_begs ],
736 M_LENS => [ @m_lens ],
739 return wantarray ? %mask_hash : \%mask_hash;
743 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<