1 package Maasha::TwoBit;
3 # Copyright (C) 2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Stuff for interacting with the 2bit format as described here:
26 # http://genome.ucsc.edu/FAQ/FAQformat#format7
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
35 use vars qw( @ISA @EXPORT );
39 use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } );
41 int find_block_beg( char *string, char c, int beg, int len )
43 /* Martin A. Hansen, March 2008 */
45 /* Given a string and a begin position, locates the next */
46 /* position in the string MATCHING a given char. */
47 /* This position is returned. If the char is not found -1 is returned. */
51 for ( i = beg; i < len; i++ )
53 if ( string[ i ] == c ) {
62 int find_block_len( char *string, char c, int beg, int len )
64 /* Martin A. Hansen, March 2008 */
66 /* Given a string and a begin position, locates the next length of */
67 /* a block consisting of a given char. The length of that block is returned. */
73 while ( i < len && string[ i ] == c )
82 char l2n[26] = { 2, 255, 1, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255 };
84 void dna2bin( char *raw, int size )
86 /* Khisanth from #perl March 2008 */
88 /* Encodes a DNA string to a bit array */
92 unsigned char packed_value = 0;
93 char *packed = malloc( size / 4 );
96 for( i = 0; i < size / 4; i++ ) {
97 packed_value = l2n[ raw[i*4] - 'A' ] << 6
98 | l2n[ raw[i*4+1] - 'A' ] << 4
99 | l2n[ raw[i*4+2] - 'A' ] << 2
100 | l2n[ raw[i*4+3] - 'A' ];
101 packed[i] = packed_value;
105 Inline_Stack_Push(sv_2mortal(newSVpvn(packed, size / 4 )));
112 "TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT",
113 "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC",
114 "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA",
115 "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG",
116 "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT",
117 "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC",
118 "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA",
119 "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG",
120 "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT",
121 "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC",
122 "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA",
123 "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG",
124 "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT",
125 "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC",
126 "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA",
127 "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG",
128 "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT",
129 "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC",
130 "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA",
131 "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG",
132 "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT",
133 "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC",
134 "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA",
135 "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG",
136 "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT",
137 "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC",
138 "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA",
139 "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG",
140 "GGGT", "GGGC", "GGGA", "GGGG"
144 void bin2dna( char *raw, int size )
146 /* Khisanth from #perl, March 2008 */
148 /* Converts a bit array to DNA which is returned. */
151 char *unpacked = malloc( 4 * size + 1 );
154 unsigned char conv_index;
157 for( i = 0; i < size; i++ ) {
158 memset( &conv_index, raw[i], 1 );
159 memcpy( unpacked + i*4, conv[conv_index], 4);
163 Inline_Stack_Push(sv_2mortal(newSVpvn(unpacked, 4 * size)));
169 void bin2dna_old( char *bin, int bin_len )
171 /* Martin A. Hansen, March 2008 */
173 /* Converts a binary string to DNA which is returned. */
179 char *dna = ( char* )( malloc( bin_len / 2 ) );
183 for ( i = 1; i < bin_len; i += 2 )
185 if ( bin[ i - 1 ] == '1' )
187 if ( bin[ i ] == '1' ) {
195 if ( bin[ i ] == '1' ) {
206 Inline_Stack_Push( sv_2mortal( newSVpvn( dna, ( bin_len / 2 ) ) ) );
213 void hard_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
215 /* Martin A. Hansen, March 2008 */
217 /* Hard masks a sequnce in a given interval, which is trimmed, */
218 /* if it does not match the sequence. */
220 int i, mask_beg, mask_len;
222 if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
224 mask_beg = beg - sub_beg;
226 if ( mask_beg < 0 ) {
232 if ( sub_len < mask_len ) {
236 for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
243 void soft_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
245 /* Martin A. Hansen, March 2008 */
247 /* Soft masks a sequnce in a given interval, which is trimmed, */
248 /* if it does not match the sequence. */
250 int i, mask_beg, mask_len;
252 if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
254 mask_beg = beg - sub_beg;
256 if ( mask_beg < 0 ) {
262 if ( sub_len < mask_len ) {
266 for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
267 seq[ i ] = seq[ i ] ^ ' ';
284 @ISA = qw( Exporter );
287 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
292 # Martin A. Hansen, March 2008.
294 # Fetches the table of contents (TOC) from a 2bit file.
295 # The TOC is returned as a list of lists.
297 # The 2bit format is described here:
298 # http://genome.ucsc.edu/FAQ/FAQformat#format7
300 my ( $fh, # filehandle
305 my ( $signature, $version, $seq_count, $reserved, $i, $seq_name_size, $string, $seq_name, $offset, @AoA );
309 $signature = unpack_32bit( $fh );
310 $version = unpack_32bit( $fh );
311 $seq_count = unpack_32bit( $fh );
312 $reserved = unpack_32bit( $fh );
314 Maasha::Common::error( qq(2bit file signature didn't match - inverse bit order?) ) if $signature != 0x1A412743;
316 for ( $i = 0; $i < $seq_count; $i++ )
318 $seq_name_size = unpack_8bit( $fh );
320 sysread $fh, $string, $seq_name_size;
322 $seq_name = unpack( "A$seq_name_size", $string );
324 $offset = unpack_32bit( $fh );
326 push @AoA, [ $seq_name, $offset ];
329 return wantarray ? @AoA : \@AoA;
335 # Martin A. Hansen, March 2008.
337 # Given a filehandle to a 2bit file, gets the sequence
338 # or subsequence from an 2bit file entry at the given
341 # The 2bit format is described here:
342 # http://genome.ucsc.edu/FAQ/FAQformat#format7
344 my ( $fh, # filehandle
345 $offset, # byte position
346 $sub_beg, # begin of subsequence - OPTIONAL
347 $sub_len, # length of subsequence - OPTIONAL
348 $mask, # retrieve soft mask information flag - OPTIONAL
353 my ( $string, $seq_len, $n_count, $n, @n_begs, @n_sizes, $m_count, $m, @m_begs, @m_sizes, $reserved, $seq );
356 $sub_len ||= 9999999999;
358 sysseek $fh, $offset, 0;
360 $seq_len = unpack_32bit( $fh );
361 $sub_len = $seq_len if $sub_len > $seq_len;
363 $n_count = unpack_32bit( $fh );
365 map { push @n_begs, unpack_32bit( $fh ) } 1 .. $n_count;
366 map { push @n_sizes, unpack_32bit( $fh ) } 1 .. $n_count;
368 $m_count = unpack_32bit( $fh );
370 map { push @m_begs, unpack_32bit( $fh ) } 1 .. $m_count;
371 map { push @m_sizes, unpack_32bit( $fh ) } 1 .. $m_count;
373 $reserved = unpack_32bit( $fh );
375 $offset += 4 + 4 + $n_count * 8 + 4 + $m_count * 8 + 4;
377 $seq = unpack_dna( $fh, $offset, $sub_beg, $sub_len );
379 for ( $n = 0; $n < $n_count; $n++ )
381 hard_mask( $seq, $n_begs[ $n ], $n_sizes[ $n ], $sub_beg, $sub_len );
383 last if $sub_beg + $sub_len < $n_begs[ $n ];
388 for ( $m = 0; $m < $m_count; $m++ )
390 soft_mask( $seq, $m_begs[ $m ], $m_sizes[ $m ], $sub_beg, $sub_len );
392 last if $sub_beg + $sub_len < $m_begs[ $m ];
402 # Martin A. Hansen, March 2008.
404 # Reads in 8 bits from the given filehandle
405 # and returns the encoded value.
407 # NB swap still needs fixing.
409 my ( $fh, # filehandle
410 $swap, # bit order swap flag - OPTIONAL
415 my ( $string, $val );
417 sysread $fh, $string, 1;
419 $val = unpack( "C", $string );
427 # Martin A. Hansen, March 2008.
429 # Reads in 32 bits from the given filehandle
430 # and returns the encoded value.
432 my ( $fh, # filehandle
433 $swap, # bit order swap flag - OPTIONAL
438 my ( $string, $val );
440 sysread $fh, $string, 4;
443 $val = unpack( "N", $string );
445 $val = unpack( "V", $string );
454 # Martin A. Hansen, March 2008.
456 # Unpacks the DNA beginning at the given filehandle.
457 # The DNA packed to two bits per base, where the first
458 # base is in the most significant 2-bit byte; the last
459 # base is in the least significant 2 bits. The packed
460 # DNA field is padded with 0 bits as necessary to take
461 # an even multiple of 32 bits in the file.
463 # NB swap still needs fixing.
465 my ( $fh, # filehandle
466 $offset, # file offset
468 $len, # sequence length
469 $swap, # bit order swap flag - OPTIONAL
474 my ( $bin, $bin_beg, $bin_len, $dna, $bin_diff, $len_diff );
476 $bin_beg = int( $beg / 4 );
477 $bin_beg-- if $beg % 4;
478 $bin_beg = 0 if $bin_beg < 0;
480 $bin_len = int( $len / 4 );
481 $bin_len++ if $len % 4;
483 sysseek $fh, $offset + $bin_beg, 0;
484 sysread $fh, $bin, $bin_len;
486 $dna = bin2dna( $bin, $bin_len );
488 $bin_diff = $beg - $bin_beg * 4;
489 $len_diff = $bin_len * 4 - $len;
491 $dna =~ s/^.{$bin_diff}// if $bin_diff;
492 $dna =~ s/.{$len_diff}$// if $len_diff;
500 # Martin A. Hansen, March 2008.
502 # Converts a FASTA file to 2bit format.
504 my ( $fh_in, # file handle to FASTA file
505 $fh_out, # output file handle - OPTIONAL
506 $mask, # preserver soft masking - OPTIONAL
509 my ( $seq_offset, $offset, $entry, $mask_index, $seq_len, $seq_name_len, $pack_len, $rec_len, $index, $bin, $seq );
511 $fh_out = \*STDOUT if not $fh_out;
513 # ---- Creating content index ----
515 $seq_offset = 0; # offset for reading sequence from FASTA file
516 $offset = 16; # offset starting after header line which is 16 bytes
518 while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
520 $seq_len = length $entry->[ SEQ ];
521 $seq_name_len = length $entry->[ SEQ_NAME ];
523 $mask_index = mask_locate( $entry->[ SEQ ], $mask );
525 $pack_len = ( $seq_len + ( 4 - ( $seq_len ) % 4 ) ) / 4;
530 + 4 * $mask_index->{ "N_COUNT" } # N begins
531 + 4 * $mask_index->{ "N_COUNT" } # N lengths
533 + 4 * $mask_index->{ "M_COUNT" } # M begins
534 + 4 * $mask_index->{ "M_COUNT" } # M lengths
536 + $pack_len # Packed DNA - 32 bit multiplum of 2 bit/base sequence in bytes
540 SEQ_NAME => $entry->[ SEQ_NAME ],
541 SEQ_NAME_LEN => $seq_name_len,
542 SEQ_BEG => $seq_offset + $seq_name_len + 2,
544 N_COUNT => $mask_index->{ "N_COUNT" },
545 N_BEGS => $mask_index->{ "N_BEGS" },
546 N_LENS => $mask_index->{ "N_LENS" },
547 M_COUNT => $mask_index->{ "M_COUNT" },
548 M_BEGS => $mask_index->{ "M_BEGS" },
549 M_LENS => $mask_index->{ "M_LENS" },
554 + 1 # 1 byte SEQ_NAME size
555 + $seq_name_len # SEQ_NAME depending on SEQ_NAME size
556 + 4 # 32 bit offset position of sequence record
559 $seq_offset += $seq_name_len + 2 + $seq_len + 1;
562 # ---- Printing Header ----
564 $bin = pack( "V4", oct "0x1A412743", "0", scalar @{ $index }, 0 ); # signature, version, sequence count and reserved
568 # ---- Printing TOC ----
572 foreach $entry ( @{ $index } )
574 $bin .= pack( "C", $entry->{ "SEQ_NAME_LEN" } ); # 1 byte SEQ_NAME size
575 $bin .= pack( qq(A$entry->{ "SEQ_NAME_LEN" }), $entry->{ "SEQ_NAME" } ); # SEQ_NAME depending on SEQ_NAME size
576 $bin .= pack( "V", $offset ); # 32 bit offset position of sequence record
578 $offset += $entry->{ "REC_LEN" };
583 # ---- Printing Records ----
585 foreach $entry ( @{ $index } )
589 $bin .= pack( "V", $entry->{ "SEQ_LEN" } );
590 $bin .= pack( "V", $entry->{ "N_COUNT" } );
592 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_BEGS" } };
593 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_LENS" } };
595 $bin .= pack( "V", $entry->{ "M_COUNT" } );
597 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_BEGS" } };
598 map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_LENS" } };
600 $bin .= pack( "V", 0 );
602 sysseek $fh_in, $entry->{ "SEQ_BEG" }, 0;
603 sysread $fh_in, $seq, $entry->{ "SEQ_LEN" };
606 $seq =~ tr/RYWSMKHDVBN/TTTTTTTTTTT/;
608 $bin .= pack_dna( $seq );
620 # Martin A. Hansen, March 2008.
622 # Packs a DNA sequence into a bit array, The DNA packed to two bits per base,
623 # represented as so: T - 00, C - 01, A - 10, G - 11. The first base is
624 # in the most significant 2-bit byte; the last base is in the least significant
625 # 2 bits. For example, the sequence TCAG is represented as 00011011.
626 # The packedDna field is padded with 0 bits as necessary to take an even
627 # multiple of 32 bits in the file.
629 my ( $dna, # dna string to pack
636 $dna .= "T" x ( 4 - ( length( $dna ) % 4 ) );
638 $bin = dna2bin( $dna, length $dna );
646 # Martin A. Hansen, March 2008.
648 # Locate N-blocks and M-blocks in a given sequence.
649 # These blocks a continously streches of Ns and Ms in a string,
650 # and the begins and lenghts of these blocks are saved in a
651 # hash along with the count of each block type.
653 my ( $seq, # Sequence
654 $mask, # preserve soft masking flag - OPTIONAL
659 my ( $n_mask, $m_mask, $seq_len, $pos, $n_beg, $n_len, $m_beg, $m_len, @n_begs, @n_lens, @m_begs, @m_lens, %mask_hash );
661 $seq =~ tr/atcgunRYWSMKHDVBrywsmkhdvb/MMMMMNNNNNNNNNNNNNNNNNNNNN/;
663 $n_mask = 1; # always mask Ns.
664 $m_mask = $mask || 0;
666 $seq_len = length $seq;
670 while ( $n_mask or $m_mask )
674 $n_beg = find_block_beg( $seq, "N", $pos, $seq_len );
676 $n_mask = 0 if $n_beg < 0;
681 $m_beg = find_block_beg( $seq, "M", $pos, $seq_len );
683 $m_mask = 0 if $m_beg < 0;
686 if ( $n_mask and $m_mask )
688 if ( $n_beg < $m_beg )
690 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
692 push @n_begs, $n_beg;
693 push @n_lens, $n_len;
695 $pos = $n_beg + $n_len;
699 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
701 push @m_begs, $m_beg;
702 push @m_lens, $m_len;
704 $pos = $m_beg + $m_len;
709 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
711 push @n_begs, $n_beg;
712 push @n_lens, $n_len;
714 $pos = $n_beg + $n_len;
718 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
720 push @m_begs, $m_beg;
721 push @m_lens, $m_len;
723 $pos = $m_beg + $m_len;
732 N_COUNT => scalar @n_begs,
733 N_BEGS => [ @n_begs ],
734 N_LENS => [ @n_lens ],
735 M_COUNT => scalar @m_begs,
736 M_BEGS => [ @m_begs ],
737 M_LENS => [ @m_lens ],
740 return wantarray ? %mask_hash : \%mask_hash;
744 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<